
/*
 * Copy `size` bytes from `src` to `dest` and report how many bytes were
 * copied, so callers can advance a cursor by the return value.
 */
INLINE size_t memtran(void *dest, void *src, size_t size) {
  (void)memcpy(dest, src, size);
  return size;
}
/*
 * pack_field(ptr, field, n):   copy n elements from `field` to the cursor
 *   `ptr`, then advance `ptr` by the number of bytes copied.
 * unpack_field(ptr, field, n): copy n elements from the cursor `ptr` into
 *   `field`, then advance `ptr` by the number of bytes copied.
 * The element size is sizeof(*(field)). Both expand to a `+=` on `ptr`, so
 * `ptr` must be a modifiable lvalue and it is expanded twice. Callers in this
 * file pass a `void *` cursor, which relies on the compiler accepting
 * arithmetic on void pointers (a GNU C extension).
 */
#define pack_field(ptr, field, n)                                              \
  (ptr) += memtran((ptr), (field), sizeof(*(field)) * (n))
#define unpack_field(ptr, field, n)                                            \
  (ptr) += memtran((field), (ptr), sizeof(*(field)) * (n))
/*
 * MPI message tags used by COMM_ISEND (stag_*) and COMM_IRECV (rtag_*).
 * A send towards `next` carries 0x3001, the value a receive from `prev`
 * expects; a send towards `prev` carries 0x3000, the value a receive from
 * `next` expects.
 */
static int stag_next = 0x3001;
static int stag_prev = 0x3000;
static int rtag_next = 0x3000;
static int rtag_prev = 0x3001;
/*
 * COMM_ISEND(mpp, size, dir, axis): start a non-blocking send of `size` bytes
 * from mpp->send_<dir> to rank mpp-><dir>.<axis>, tagged stag_<dir>; the
 * request is stored in mpp->send_req_<dir>. `dir` is pasted into identifiers,
 * so it must be the bare token `prev` or `next`.
 */
#define COMM_ISEND(mpp, size, dir, axis)                                       \
  MPI_Isend((mpp)->send_##dir, (size), MPI_BYTE, (mpp)->dir.axis, stag_##dir,  \
            (mpp)->comm, &(mpp)->send_req_##dir)
/*
 * COMM_IRECV(mpp, dir, axis): start a non-blocking receive of up to
 * mpp->max_comm_size bytes into mpp->recv_<dir> from rank mpp-><dir>.<axis>,
 * tagged rtag_<dir>; the request is stored in mpp->recv_req_<dir>.
 */
#define COMM_IRECV(mpp, dir, axis)                                             \
  MPI_Irecv((mpp)->recv_##dir, (mpp)->max_comm_size, MPI_BYTE,                 \
            (mpp)->dir.axis, rtag_##dir, (mpp)->comm, &(mpp)->recv_req_##dir)
/*
 * COMM_WAITALL(mpp): block until the four outstanding requests in `mpp`
 * (send prev/next, receive prev/next) have completed; each status is stored
 * in the matching mpp-><send|recv>_stat_<prev|next> field.
 * Wrapped in do { } while (0) so that `COMM_WAITALL(mpp);` is a single
 * statement and stays correct inside an un-braced if/else.
 */
#define COMM_WAITALL(mpp)                                                      \
  do {                                                                         \
    MPI_Wait(&((mpp)->send_req_prev), &((mpp)->send_stat_prev));               \
    MPI_Wait(&((mpp)->send_req_next), &((mpp)->send_stat_next));               \
    MPI_Wait(&((mpp)->recv_req_prev), &((mpp)->recv_stat_prev));               \
    MPI_Wait(&((mpp)->recv_req_next), &((mpp)->recv_stat_next));               \
  } while (0)
/*
 * UP_ALIGN(ptr, alignment): round the address `ptr` up to the next multiple
 * of `alignment` (which must be a power of two) and yield it as a `void *`.
 * `alignment` is parenthesised and widened to long before the mask is built,
 * so expression arguments such as `1 << 3` and unsigned 32-bit alignments
 * produce a full-width mask instead of clearing upper address bits.
 */
#define UP_ALIGN(ptr, alignment)                                               \
  ((void *)((((long)(ptr)) + ((long)(alignment) - 1)) &                        \
            ~((long)(alignment) - 1)))
/* Element count of the local staging array `lbuf` declared by the
 * pack_cell_*_cpe functions (49152 = 48 * 1024). */
#define MAX_CELL_COMM 49152

/*
 * Stage one cell's atom count and per-atom fields (tag, x, q, t, mass, rmass)
 * in a local buffer, then write the staged bytes to `buf`.
 *
 * pe_get(a, b, n) is used here with `a` as the source and `b` as the local
 * destination (meta is read right after being the second argument);
 * pe_put(buf, lbuf, n) is presumably the opposite direction - TODO confirm
 * against dma_macros_new.h.
 *
 * Returns the number of staged bytes (distance from the start of lbuf to the
 * final, 32-aligned cursor).
 *
 * NOTE(review): this layout (4-byte int count, aligned fields) differs from
 * what unpack_cell_forward_most reads (8-byte long count, unpadded fields).
 * NOTE(review): UP_ALIGN rounds absolute addresses, so field offsets inside
 * the staged bytes depend on how lbuf itself is aligned.
 * NOTE(review): nothing checks that the fields fit in MAX_CELL_COMM bytes.
 */
size_t pack_cell_forward_most_cpe(void *buf, celldata_t *cell) {
  dma_init();
  cellmeta_t meta;
  pe_get(&cell->basis, &meta, sizeof(cellmeta_t));
  dma_syn();
  /* Byte array: an array of void is not a valid C object. */
  char lbuf[MAX_CELL_COMM];
  char *lptr = lbuf;
  /* Store the count with memcpy so no int alignment of lbuf is assumed. */
  int natom = meta.natom;
  memcpy(lptr, &natom, sizeof(natom));
  lptr += 4;
  lptr = UP_ALIGN(lptr, 8);
  pe_get(cell->tag, lptr, sizeof((*cell->tag)) * meta.natom);
  lptr += sizeof((*cell->tag)) * meta.natom;
  dma_syn();
  lptr = UP_ALIGN(lptr, 8);
  pe_get(cell->x, lptr, sizeof((*cell->x)) * meta.natom);
  lptr += sizeof((*cell->x)) * meta.natom;
  dma_syn();
  lptr = UP_ALIGN(lptr, 8);
  pe_get(cell->q, lptr, sizeof((*cell->q)) * meta.natom);
  lptr += sizeof((*cell->q)) * meta.natom;
  dma_syn();
  lptr = UP_ALIGN(lptr, 4);
  pe_get(cell->t, lptr, sizeof((*cell->t)) * meta.natom);
  lptr += sizeof((*cell->t)) * meta.natom;
  dma_syn();
  lptr = UP_ALIGN(lptr, 8);
  pe_get(cell->mass, lptr, sizeof((*cell->mass)) * meta.natom);
  lptr += sizeof((*cell->mass)) * meta.natom;
  dma_syn();
  lptr = UP_ALIGN(lptr, 8);
  pe_get(cell->rmass, lptr, sizeof((*cell->rmass)) * meta.natom);
  lptr += sizeof((*cell->rmass)) * meta.natom;
  dma_syn();
  lptr = UP_ALIGN(lptr, 32);
  /* Length is measured from the staging buffer, not from the unrelated
   * destination pointer `buf`. */
  pe_put(buf, lbuf, lptr - lbuf);
  dma_syn();
  return lptr - lbuf;
}
/*
 * Serialise one cell into `buf`: an 8-byte header holding cell->natom as a
 * long, followed by natom elements each of tag, x, q, t, mass and rmass,
 * back to back with no padding.
 * Returns the number of bytes written.
 */
size_t pack_cell_forward_most(void *buf, celldata_t *cell) {
  /* The brick packers place records back to back, so `buf` can sit at any
   * byte offset; memcpy avoids a misaligned long store. */
  long natom = cell->natom;
  memcpy(buf, &natom, sizeof(natom));
  void *ptr = buf + 8;
  pack_field(ptr, cell->tag, cell->natom);
  pack_field(ptr, cell->x, cell->natom);
  pack_field(ptr, cell->q, cell->natom);
  pack_field(ptr, cell->t, cell->natom);
  pack_field(ptr, cell->mass, cell->natom);
  pack_field(ptr, cell->rmass, cell->natom);
  return ptr - buf;
}
/*
 * Inverse of pack_cell_forward_most. Fields on the wire, in order:
 * ['tag', 'x', 'q', 't', 'mass', 'rmass'], preceded by an 8-byte header
 * holding the atom count as a long.
 * Overwrites cell->natom with the received count, then fills each field with
 * that many elements. Returns the number of bytes consumed from `buf`.
 */
size_t unpack_cell_forward_most(void *buf, celldata_t *cell) {
  /* The record may start at any byte offset inside the brick buffer, so the
   * header is read with memcpy rather than through a cast long pointer. */
  long natom;
  memcpy(&natom, buf, sizeof(natom));
  cell->natom = natom;
  void *ptr = buf + 8;
  unpack_field(ptr, cell->tag, cell->natom);
  unpack_field(ptr, cell->x, cell->natom);
  unpack_field(ptr, cell->q, cell->natom);
  unpack_field(ptr, cell->t, cell->natom);
  unpack_field(ptr, cell->mass, cell->natom);
  unpack_field(ptr, cell->rmass, cell->natom);
  return ptr - buf;
}

/*
 * Pack every cell with x in [xlo, xhi), y in [ylo, yhi), z in [zlo, zhi)
 * (z varying fastest) into `buf`, one pack_cell_forward_most record after
 * another. Returns the total number of bytes written.
 */
size_t pack_brick_forward_most(void *buf, cellgrid_t *grid, int xlo, int xhi,
                               int ylo, int yhi, int zlo, int zhi) {
  size_t used = 0;
  for (int ix = xlo; ix < xhi; ++ix) {
    for (int iy = ylo; iy < yhi; ++iy) {
      for (int iz = zlo; iz < zhi; ++iz) {
        celldata_t *c = get_cell_xyz(grid, ix, iy, iz);
        used += pack_cell_forward_most((char *)buf + used, c);
      }
    }
  }
  return used;
}

/*
 * Unpack consecutive pack_cell_forward_most records from `buf` into every
 * cell with x in [xlo, xhi), y in [ylo, yhi), z in [zlo, zhi) (z varying
 * fastest). Returns the total number of bytes consumed.
 */
size_t unpack_brick_forward_most(void *buf, cellgrid_t *grid, int xlo, int xhi,
                                 int ylo, int yhi, int zlo, int zhi) {
  size_t used = 0;
  for (int ix = xlo; ix < xhi; ++ix) {
    for (int iy = ylo; iy < yhi; ++iy) {
      for (int iz = zlo; iz < zhi; ++iz) {
        celldata_t *c = get_cell_xyz(grid, ix, iy, iz);
        used += unpack_cell_forward_most((char *)buf + used, c);
      }
    }
  }
  return used;
}

/*
 * Argument bundle read by the *_brick_*_cpe functions: the communication
 * buffer, the cell grid, and the half-open index ranges [xlo, xhi),
 * [ylo, yhi), [zlo, zhi) of the brick to process.
 */
typedef struct pack_brick_param {
  void *buf;
  cellgrid_t *grid;
  int xlo, xhi, ylo, yhi, zlo, zhi;
} pack_brick_param_t;
#ifdef __sw_host__
/* Host-side declarations of the slave_-prefixed entry points; presumably the
 * symbols under which pack_brick_forward_most_cpe and
 * unpack_brick_forward_most_cpe are visible to the host - TODO confirm. */
extern void slave_pack_brick_forward_most_cpe(void *);
extern void slave_unpack_brick_forward_most_cpe(void *);
#endif
#ifdef __sw_slave__
#include "dma_macros_new.h"
#include <qthread_slave.h>
/*
 * CPE-side brick packer: fetch the parameter block and the grid descriptor,
 * then pack every cell of the brick whose linear index satisfies
 * (cellid & 63) == _MYID. Returns the bytes this caller wrote from lpm.buf.
 *
 * Fixes: the loop bounds and the return value now use the fetched copy `lpm`
 * (bare xlo/xhi/.../buf are not declared here), and the mask test is
 * parenthesised because `!=` binds tighter than `&`.
 *
 * NOTE(review): every caller starts at lpm.buf and advances only for the
 * cells it keeps, so callers with different _MYID would write overlapping
 * regions - confirm the intended layout.
 * NOTE(review): this calls the host packer pack_cell_forward_most, not
 * pack_cell_forward_most_cpe, and does not call dma_init() before pe_get as
 * the cell-level *_cpe packers do - confirm both.
 */
size_t pack_brick_forward_most_cpe(pack_brick_param_t *pm) {
  pack_brick_param_t lpm;
  cellgrid_t lgrid;
  pe_get(pm, &lpm, sizeof(pack_brick_param_t));
  dma_syn();
  pe_get(lpm.grid, &lgrid, sizeof(cellgrid_t));
  void *ptr = lpm.buf;
  int ylen = lpm.yhi - lpm.ylo;
  int zlen = lpm.zhi - lpm.zlo;
  dma_syn();
  for (int i = lpm.xlo; i < lpm.xhi; i++) {
    for (int j = lpm.ylo; j < lpm.yhi; j++) {
      for (int k = lpm.zlo; k < lpm.zhi; k++) {
        int cellid = ((i - lpm.xlo) * ylen + (j - lpm.ylo)) * zlen + (k - lpm.zlo);
        if ((cellid & 63) != _MYID)
          continue;
        celldata_t *cell = get_cell_xyz(&lgrid, i, j, k);
        ptr += pack_cell_forward_most(ptr, cell);
      }
    }
  }
  return ptr - lpm.buf;
}
/*
 * CPE-side brick unpacker: fetch the parameter block and the grid
 * descriptor, then unpack into every cell of the brick whose linear index
 * satisfies (cellid & 63) == _MYID. Returns the bytes this caller consumed
 * from lpm.buf.
 *
 * Fixes: the loop bounds and the return value now use the fetched copy `lpm`
 * (bare xlo/xhi/.../buf are not declared here), and the mask test is
 * parenthesised because `!=` binds tighter than `&`.
 *
 * NOTE(review): every caller starts reading at lpm.buf and advances only for
 * the cells it keeps, so the record a caller reads does not correspond to
 * cellid unless the buffer is laid out per caller - confirm.
 */
size_t unpack_brick_forward_most_cpe(pack_brick_param_t *pm) {
  pack_brick_param_t lpm;
  cellgrid_t lgrid;
  pe_get(pm, &lpm, sizeof(pack_brick_param_t));
  dma_syn();
  pe_get(lpm.grid, &lgrid, sizeof(cellgrid_t));
  void *ptr = lpm.buf;
  int ylen = lpm.yhi - lpm.ylo;
  int zlen = lpm.zhi - lpm.zlo;
  dma_syn();
  for (int i = lpm.xlo; i < lpm.xhi; i++) {
    for (int j = lpm.ylo; j < lpm.yhi; j++) {
      for (int k = lpm.zlo; k < lpm.zhi; k++) {
        int cellid = ((i - lpm.xlo) * ylen + (j - lpm.ylo)) * zlen + (k - lpm.zlo);
        if ((cellid & 63) != _MYID)
          continue;
        celldata_t *cell = get_cell_xyz(&lgrid, i, j, k);
        ptr += unpack_cell_forward_most(ptr, cell);
      }
    }
  }
  return ptr - lpm.buf;
}

#endif

/*
 * Exchange the full per-atom record (see pack_cell_forward_most) with the
 * prev/next neighbours along z, then y, then x. Each phase posts both
 * receives, packs and sends both boundary bricks, waits for all four
 * requests, and unpacks what was received.
 * Packed bricks use indices inside [0, nlocal); received bricks are unpacked
 * into [lo, lo + nn) and [hi - nn, hi) - presumably the ghost layers of
 * width nn; TODO confirm against the grid definition.
 */
void forward_comm_most(cellgrid_t *grid, mpp_t *mpp) {
  vec<int> *lo = &(grid->dim.lo);
  vec<int> *hi = &(grid->dim.hi);
  vec<int> *nlocal = &(grid->nlocal);
  int nn = grid->nn;

  size_t nsend_prev, nsend_next;

  // z phase: x and y span only [0, nlocal).
  COMM_IRECV(mpp, prev, z);
  COMM_IRECV(mpp, next, z);
  nsend_prev = pack_brick_forward_most(mpp->send_prev, grid, 0, nlocal->x, 0,
                                       nlocal->y, 0, nn);
  nsend_next = pack_brick_forward_most(mpp->send_next, grid, 0, nlocal->x, 0,
                                       nlocal->y, nlocal->z - nn, nlocal->z);
  COMM_ISEND(mpp, nsend_prev, prev, z);
  COMM_ISEND(mpp, nsend_next, next, z);
  COMM_WAITALL(mpp);
  unpack_brick_forward_most(mpp->recv_prev, grid, 0, nlocal->x, 0, nlocal->y,
                            lo->z, lo->z + nn);
  unpack_brick_forward_most(mpp->recv_next, grid, 0, nlocal->x, 0, nlocal->y,
                            hi->z - nn, hi->z);

  // y phase: z now spans [lo->z, hi->z), i.e. includes what the z phase
  // just filled in.
  COMM_IRECV(mpp, prev, y);
  COMM_IRECV(mpp, next, y);
  nsend_prev = pack_brick_forward_most(mpp->send_prev, grid, 0, nlocal->x, 0,
                                       nn, lo->z, hi->z);
  nsend_next = pack_brick_forward_most(mpp->send_next, grid, 0, nlocal->x,
                                       nlocal->y - nn, nlocal->y, lo->z, hi->z);
  COMM_ISEND(mpp, nsend_prev, prev, y);
  COMM_ISEND(mpp, nsend_next, next, y);
  COMM_WAITALL(mpp);
  unpack_brick_forward_most(mpp->recv_prev, grid, 0, nlocal->x, lo->y,
                            lo->y + nn, lo->z, hi->z);
  unpack_brick_forward_most(mpp->recv_next, grid, 0, nlocal->x, hi->y - nn,
                            hi->y, lo->z, hi->z);

  // x phase: both y and z span their full [lo, hi) ranges.
  COMM_IRECV(mpp, prev, x);
  COMM_IRECV(mpp, next, x);
  nsend_prev = pack_brick_forward_most(mpp->send_prev, grid, 0, nn, lo->y,
                                       hi->y, lo->z, hi->z);
  nsend_next = pack_brick_forward_most(mpp->send_next, grid, nlocal->x - nn,
                                       nlocal->x, lo->y, hi->y, lo->z, hi->z);
  COMM_ISEND(mpp, nsend_prev, prev, x);
  COMM_ISEND(mpp, nsend_next, next, x);
  COMM_WAITALL(mpp);
  unpack_brick_forward_most(mpp->recv_prev, grid, lo->x, lo->x + nn, lo->y,
                            hi->y, lo->z, hi->z);
  unpack_brick_forward_most(mpp->recv_next, grid, hi->x - nn, hi->x, lo->y,
                            hi->y, lo->z, hi->z);
}

/*
 * Stage one cell's `x` array (meta.natom elements) in a local buffer and
 * write the staged bytes to `buf`. Returns the number of staged bytes
 * (distance from the start of lbuf to the final, 32-aligned cursor).
 *
 * pe_get(a, b, n) is used with `a` as source and `b` as local destination;
 * pe_put(buf, lbuf, n) is presumably the opposite direction - TODO confirm.
 * NOTE(review): UP_ALIGN rounds absolute addresses, so if lbuf is not already
 * 8-byte aligned the staged bytes begin with unwritten padding.
 */
size_t pack_cell_forward_x_cpe(void *buf, celldata_t *cell) {
  dma_init();
  cellmeta_t meta;
  pe_get(&cell->basis, &meta, sizeof(cellmeta_t));
  dma_syn();
  /* Byte array: an array of void is not a valid C object. */
  char lbuf[MAX_CELL_COMM];
  char *lptr = lbuf;
  lptr = UP_ALIGN(lptr, 8);
  pe_get(cell->x, lptr, sizeof((*cell->x)) * meta.natom);
  lptr += sizeof((*cell->x)) * meta.natom;
  dma_syn();
  lptr = UP_ALIGN(lptr, 32);
  /* Length is measured from the staging buffer, not from `buf`. */
  pe_put(buf, lbuf, lptr - lbuf);
  dma_syn();
  return lptr - lbuf;
}
/* Copy cell->x (cell->natom elements) to `buf`; returns the bytes written. */
size_t pack_cell_forward_x(void *buf, celldata_t *cell) {
  const size_t nbytes = sizeof(*(cell->x)) * cell->natom;
  return memtran(buf, cell->x, nbytes);
}
/* Fields on the wire: ['x'].
 * Copy cell->natom elements from `buf` into cell->x; returns the bytes
 * consumed. cell->natom must already hold the element count. */
size_t unpack_cell_forward_x(void *buf, celldata_t *cell) {
  const size_t nbytes = sizeof(*(cell->x)) * cell->natom;
  return memtran(cell->x, buf, nbytes);
}

/*
 * Pack the `x` arrays of every cell with x in [xlo, xhi), y in [ylo, yhi),
 * z in [zlo, zhi) (z varying fastest) into `buf`, back to back.
 * Returns the total number of bytes written.
 */
size_t pack_brick_forward_x(void *buf, cellgrid_t *grid, int xlo, int xhi,
                            int ylo, int yhi, int zlo, int zhi) {
  size_t used = 0;
  for (int ix = xlo; ix < xhi; ++ix) {
    for (int iy = ylo; iy < yhi; ++iy) {
      for (int iz = zlo; iz < zhi; ++iz) {
        celldata_t *c = get_cell_xyz(grid, ix, iy, iz);
        used += pack_cell_forward_x((char *)buf + used, c);
      }
    }
  }
  return used;
}

/*
 * Unpack consecutive `x` arrays from `buf` into every cell with x in
 * [xlo, xhi), y in [ylo, yhi), z in [zlo, zhi) (z varying fastest).
 * Returns the total number of bytes consumed.
 */
size_t unpack_brick_forward_x(void *buf, cellgrid_t *grid, int xlo, int xhi,
                              int ylo, int yhi, int zlo, int zhi) {
  size_t used = 0;
  for (int ix = xlo; ix < xhi; ++ix) {
    for (int iy = ylo; iy < yhi; ++iy) {
      for (int iz = zlo; iz < zhi; ++iz) {
        celldata_t *c = get_cell_xyz(grid, ix, iy, iz);
        used += unpack_cell_forward_x((char *)buf + used, c);
      }
    }
  }
  return used;
}

/* pack_brick_param_t is already defined earlier in this file (ahead of the
 * forward_most CPE brick packers). Repeating the struct body here redefines
 * struct pack_brick_param, which C rejects, so the duplicate is dropped. */
#ifdef __sw_host__
/* Host-side declarations of the slave_-prefixed entry points; presumably the
 * symbols under which pack_brick_forward_x_cpe and
 * unpack_brick_forward_x_cpe are visible to the host - TODO confirm. */
extern void slave_pack_brick_forward_x_cpe(void *);
extern void slave_unpack_brick_forward_x_cpe(void *);
#endif
#ifdef __sw_slave__
#include "dma_macros_new.h"
#include <qthread_slave.h>
/*
 * CPE-side brick packer for `x`: fetch the parameter block and the grid
 * descriptor, then pack every cell of the brick whose linear index satisfies
 * (cellid & 63) == _MYID. Returns the bytes this caller wrote from lpm.buf.
 *
 * Fixes: the loop bounds and the return value now use the fetched copy `lpm`
 * (bare xlo/xhi/.../buf are not declared here), and the mask test is
 * parenthesised because `!=` binds tighter than `&`.
 *
 * NOTE(review): every caller starts at lpm.buf and advances only for the
 * cells it keeps, so callers with different _MYID would write overlapping
 * regions - confirm the intended layout.
 */
size_t pack_brick_forward_x_cpe(pack_brick_param_t *pm) {
  pack_brick_param_t lpm;
  cellgrid_t lgrid;
  pe_get(pm, &lpm, sizeof(pack_brick_param_t));
  dma_syn();
  pe_get(lpm.grid, &lgrid, sizeof(cellgrid_t));
  void *ptr = lpm.buf;
  int ylen = lpm.yhi - lpm.ylo;
  int zlen = lpm.zhi - lpm.zlo;
  dma_syn();
  for (int i = lpm.xlo; i < lpm.xhi; i++) {
    for (int j = lpm.ylo; j < lpm.yhi; j++) {
      for (int k = lpm.zlo; k < lpm.zhi; k++) {
        int cellid = ((i - lpm.xlo) * ylen + (j - lpm.ylo)) * zlen + (k - lpm.zlo);
        if ((cellid & 63) != _MYID)
          continue;
        celldata_t *cell = get_cell_xyz(&lgrid, i, j, k);
        ptr += pack_cell_forward_x(ptr, cell);
      }
    }
  }
  return ptr - lpm.buf;
}
/*
 * CPE-side brick unpacker for `x`: fetch the parameter block and the grid
 * descriptor, then unpack into every cell of the brick whose linear index
 * satisfies (cellid & 63) == _MYID. Returns the bytes this caller consumed
 * from lpm.buf.
 *
 * Fixes: the loop bounds and the return value now use the fetched copy `lpm`
 * (bare xlo/xhi/.../buf are not declared here), and the mask test is
 * parenthesised because `!=` binds tighter than `&`.
 *
 * NOTE(review): every caller starts reading at lpm.buf and advances only for
 * the cells it keeps - confirm the buffer is laid out per caller.
 */
size_t unpack_brick_forward_x_cpe(pack_brick_param_t *pm) {
  pack_brick_param_t lpm;
  cellgrid_t lgrid;
  pe_get(pm, &lpm, sizeof(pack_brick_param_t));
  dma_syn();
  pe_get(lpm.grid, &lgrid, sizeof(cellgrid_t));
  void *ptr = lpm.buf;
  int ylen = lpm.yhi - lpm.ylo;
  int zlen = lpm.zhi - lpm.zlo;
  dma_syn();
  for (int i = lpm.xlo; i < lpm.xhi; i++) {
    for (int j = lpm.ylo; j < lpm.yhi; j++) {
      for (int k = lpm.zlo; k < lpm.zhi; k++) {
        int cellid = ((i - lpm.xlo) * ylen + (j - lpm.ylo)) * zlen + (k - lpm.zlo);
        if ((cellid & 63) != _MYID)
          continue;
        celldata_t *cell = get_cell_xyz(&lgrid, i, j, k);
        ptr += unpack_cell_forward_x(ptr, cell);
      }
    }
  }
  return ptr - lpm.buf;
}

#endif

/*
 * Exchange the per-atom `x` arrays with the prev/next neighbours along z,
 * then y, then x. Each phase posts both receives, packs and sends both
 * boundary bricks, waits for all four requests, and unpacks what was
 * received.
 * Packed bricks use indices inside [0, nlocal); received bricks are unpacked
 * into [lo, lo + nn) and [hi - nn, hi) - presumably the ghost layers of
 * width nn; TODO confirm against the grid definition.
 * The unpack side sizes each record from the receiving cell's own natom.
 */
void forward_comm_x(cellgrid_t *grid, mpp_t *mpp) {
  vec<int> *lo = &(grid->dim.lo);
  vec<int> *hi = &(grid->dim.hi);
  vec<int> *nlocal = &(grid->nlocal);
  int nn = grid->nn;

  size_t nsend_prev, nsend_next;

  // z phase: x and y span only [0, nlocal).
  COMM_IRECV(mpp, prev, z);
  COMM_IRECV(mpp, next, z);
  nsend_prev = pack_brick_forward_x(mpp->send_prev, grid, 0, nlocal->x, 0,
                                    nlocal->y, 0, nn);
  nsend_next = pack_brick_forward_x(mpp->send_next, grid, 0, nlocal->x, 0,
                                    nlocal->y, nlocal->z - nn, nlocal->z);
  COMM_ISEND(mpp, nsend_prev, prev, z);
  COMM_ISEND(mpp, nsend_next, next, z);
  COMM_WAITALL(mpp);
  unpack_brick_forward_x(mpp->recv_prev, grid, 0, nlocal->x, 0, nlocal->y,
                         lo->z, lo->z + nn);
  unpack_brick_forward_x(mpp->recv_next, grid, 0, nlocal->x, 0, nlocal->y,
                         hi->z - nn, hi->z);

  // y phase: z now spans [lo->z, hi->z).
  COMM_IRECV(mpp, prev, y);
  COMM_IRECV(mpp, next, y);
  nsend_prev = pack_brick_forward_x(mpp->send_prev, grid, 0, nlocal->x, 0, nn,
                                    lo->z, hi->z);
  nsend_next = pack_brick_forward_x(mpp->send_next, grid, 0, nlocal->x,
                                    nlocal->y - nn, nlocal->y, lo->z, hi->z);
  COMM_ISEND(mpp, nsend_prev, prev, y);
  COMM_ISEND(mpp, nsend_next, next, y);
  COMM_WAITALL(mpp);
  unpack_brick_forward_x(mpp->recv_prev, grid, 0, nlocal->x, lo->y, lo->y + nn,
                         lo->z, hi->z);
  unpack_brick_forward_x(mpp->recv_next, grid, 0, nlocal->x, hi->y - nn, hi->y,
                         lo->z, hi->z);

  // x phase: both y and z span their full [lo, hi) ranges.
  COMM_IRECV(mpp, prev, x);
  COMM_IRECV(mpp, next, x);
  nsend_prev = pack_brick_forward_x(mpp->send_prev, grid, 0, nn, lo->y, hi->y,
                                    lo->z, hi->z);
  nsend_next = pack_brick_forward_x(mpp->send_next, grid, nlocal->x - nn,
                                    nlocal->x, lo->y, hi->y, lo->z, hi->z);
  COMM_ISEND(mpp, nsend_prev, prev, x);
  COMM_ISEND(mpp, nsend_next, next, x);
  COMM_WAITALL(mpp);
  unpack_brick_forward_x(mpp->recv_prev, grid, lo->x, lo->x + nn, lo->y, hi->y,
                         lo->z, hi->z);
  unpack_brick_forward_x(mpp->recv_next, grid, hi->x - nn, hi->x, lo->y, hi->y,
                         lo->z, hi->z);
}

/*
 * Stage one cell's `f` array (meta.natom elements) in a local buffer and
 * write the staged bytes to `buf`. Returns the number of staged bytes
 * (distance from the start of lbuf to the final, 32-aligned cursor).
 *
 * pe_get(a, b, n) is used with `a` as source and `b` as local destination;
 * pe_put(buf, lbuf, n) is presumably the opposite direction - TODO confirm.
 * NOTE(review): UP_ALIGN rounds absolute addresses, so if lbuf is not already
 * 8-byte aligned the staged bytes begin with unwritten padding.
 */
size_t pack_cell_reverse_f_cpe(void *buf, celldata_t *cell) {
  dma_init();
  cellmeta_t meta;
  pe_get(&cell->basis, &meta, sizeof(cellmeta_t));
  dma_syn();
  /* Byte array: an array of void is not a valid C object. */
  char lbuf[MAX_CELL_COMM];
  char *lptr = lbuf;
  lptr = UP_ALIGN(lptr, 8);
  pe_get(cell->f, lptr, sizeof((*cell->f)) * meta.natom);
  lptr += sizeof((*cell->f)) * meta.natom;
  dma_syn();
  lptr = UP_ALIGN(lptr, 32);
  /* Length is measured from the staging buffer, not from `buf`. */
  pe_put(buf, lbuf, lptr - lbuf);
  dma_syn();
  return lptr - lbuf;
}
/* Copy cell->f (cell->natom elements) to `buf`; returns the bytes written. */
size_t pack_cell_reverse_f(void *buf, celldata_t *cell) {
  const size_t nbytes = sizeof(*(cell->f)) * cell->natom;
  return memtran(buf, cell->f, nbytes);
}
/* Fields on the wire: ['f'].
 * Accumulate (not overwrite): each of the cell->natom vectors read from
 * `buf` is added to the matching cell->f entry via vecaddv.
 * Returns the bytes consumed. */
size_t unpack_cell_reverse_f(void *buf, celldata_t *cell) {
  vec<real> *incoming = buf;
  int idx = 0;
  while (idx < cell->natom) {
    vecaddv(cell->f[idx], cell->f[idx], incoming[idx]);
    idx++;
  }
  return sizeof(*(cell->f)) * cell->natom;
}

/*
 * Pack the `f` arrays of every cell with x in [xlo, xhi), y in [ylo, yhi),
 * z in [zlo, zhi) (z varying fastest) into `buf`, back to back.
 * Returns the total number of bytes written.
 */
size_t pack_brick_reverse_f(void *buf, cellgrid_t *grid, int xlo, int xhi,
                            int ylo, int yhi, int zlo, int zhi) {
  size_t used = 0;
  for (int ix = xlo; ix < xhi; ++ix) {
    for (int iy = ylo; iy < yhi; ++iy) {
      for (int iz = zlo; iz < zhi; ++iz) {
        celldata_t *c = get_cell_xyz(grid, ix, iy, iz);
        used += pack_cell_reverse_f((char *)buf + used, c);
      }
    }
  }
  return used;
}

/*
 * Accumulate consecutive `f` arrays from `buf` into every cell with x in
 * [xlo, xhi), y in [ylo, yhi), z in [zlo, zhi) (z varying fastest).
 * Returns the total number of bytes consumed.
 */
size_t unpack_brick_reverse_f(void *buf, cellgrid_t *grid, int xlo, int xhi,
                              int ylo, int yhi, int zlo, int zhi) {
  size_t used = 0;
  for (int ix = xlo; ix < xhi; ++ix) {
    for (int iy = ylo; iy < yhi; ++iy) {
      for (int iz = zlo; iz < zhi; ++iz) {
        celldata_t *c = get_cell_xyz(grid, ix, iy, iz);
        used += unpack_cell_reverse_f((char *)buf + used, c);
      }
    }
  }
  return used;
}

/* pack_brick_param_t is already defined earlier in this file (ahead of the
 * forward_most CPE brick packers). Repeating the struct body here redefines
 * struct pack_brick_param, which C rejects, so the duplicate is dropped. */
#ifdef __sw_host__
/* Host-side declarations of the slave_-prefixed entry points; presumably the
 * symbols under which pack_brick_reverse_f_cpe and
 * unpack_brick_reverse_f_cpe are visible to the host - TODO confirm. */
extern void slave_pack_brick_reverse_f_cpe(void *);
extern void slave_unpack_brick_reverse_f_cpe(void *);
#endif
#ifdef __sw_slave__
#include "dma_macros_new.h"
#include <qthread_slave.h>
/*
 * CPE-side brick packer for `f`: fetch the parameter block and the grid
 * descriptor, then pack every cell of the brick whose linear index satisfies
 * (cellid & 63) == _MYID. Returns the bytes this caller wrote from lpm.buf.
 *
 * Fixes: the loop bounds and the return value now use the fetched copy `lpm`
 * (bare xlo/xhi/.../buf are not declared here), and the mask test is
 * parenthesised because `!=` binds tighter than `&`.
 *
 * NOTE(review): every caller starts at lpm.buf and advances only for the
 * cells it keeps, so callers with different _MYID would write overlapping
 * regions - confirm the intended layout.
 */
size_t pack_brick_reverse_f_cpe(pack_brick_param_t *pm) {
  pack_brick_param_t lpm;
  cellgrid_t lgrid;
  pe_get(pm, &lpm, sizeof(pack_brick_param_t));
  dma_syn();
  pe_get(lpm.grid, &lgrid, sizeof(cellgrid_t));
  void *ptr = lpm.buf;
  int ylen = lpm.yhi - lpm.ylo;
  int zlen = lpm.zhi - lpm.zlo;
  dma_syn();
  for (int i = lpm.xlo; i < lpm.xhi; i++) {
    for (int j = lpm.ylo; j < lpm.yhi; j++) {
      for (int k = lpm.zlo; k < lpm.zhi; k++) {
        int cellid = ((i - lpm.xlo) * ylen + (j - lpm.ylo)) * zlen + (k - lpm.zlo);
        if ((cellid & 63) != _MYID)
          continue;
        celldata_t *cell = get_cell_xyz(&lgrid, i, j, k);
        ptr += pack_cell_reverse_f(ptr, cell);
      }
    }
  }
  return ptr - lpm.buf;
}
/*
 * CPE-side brick unpacker for `f`: fetch the parameter block and the grid
 * descriptor, then accumulate into every cell of the brick whose linear
 * index satisfies (cellid & 63) == _MYID. Returns the bytes this caller
 * consumed from lpm.buf.
 *
 * Fixes: the loop bounds and the return value now use the fetched copy `lpm`
 * (bare xlo/xhi/.../buf are not declared here), and the mask test is
 * parenthesised because `!=` binds tighter than `&`.
 *
 * NOTE(review): every caller starts reading at lpm.buf and advances only for
 * the cells it keeps - confirm the buffer is laid out per caller.
 */
size_t unpack_brick_reverse_f_cpe(pack_brick_param_t *pm) {
  pack_brick_param_t lpm;
  cellgrid_t lgrid;
  pe_get(pm, &lpm, sizeof(pack_brick_param_t));
  dma_syn();
  pe_get(lpm.grid, &lgrid, sizeof(cellgrid_t));
  void *ptr = lpm.buf;
  int ylen = lpm.yhi - lpm.ylo;
  int zlen = lpm.zhi - lpm.zlo;
  dma_syn();
  for (int i = lpm.xlo; i < lpm.xhi; i++) {
    for (int j = lpm.ylo; j < lpm.yhi; j++) {
      for (int k = lpm.zlo; k < lpm.zhi; k++) {
        int cellid = ((i - lpm.xlo) * ylen + (j - lpm.ylo)) * zlen + (k - lpm.zlo);
        if ((cellid & 63) != _MYID)
          continue;
        celldata_t *cell = get_cell_xyz(&lgrid, i, j, k);
        ptr += unpack_cell_reverse_f(ptr, cell);
      }
    }
  }
  return ptr - lpm.buf;
}

#endif

/*
 * Reverse exchange of the per-atom `f` arrays with the prev/next neighbours
 * along x, then y, then z (the opposite axis order to the forward_comm_*
 * functions). Each phase posts both receives, packs and sends the bricks at
 * [lo, lo + nn) and [hi - nn, hi), waits for all four requests, and adds the
 * received values into the bricks at [0, nn) and [nlocal - nn, nlocal)
 * (unpack_cell_reverse_f accumulates rather than overwrites).
 * [lo, lo + nn) / [hi - nn, hi) are presumably the ghost layers - TODO
 * confirm against the grid definition.
 */
void reverse_comm_f(cellgrid_t *grid, mpp_t *mpp) {
  vec<int> *lo = &(grid->dim.lo);
  vec<int> *hi = &(grid->dim.hi);
  vec<int> *nlocal = &(grid->nlocal);
  int nn = grid->nn;

  size_t nsend_prev, nsend_next;

  // x phase: y and z span their full [lo, hi) ranges.
  COMM_IRECV(mpp, prev, x);
  COMM_IRECV(mpp, next, x);
  nsend_prev = pack_brick_reverse_f(mpp->send_prev, grid, lo->x, lo->x + nn,
                                    lo->y, hi->y, lo->z, hi->z);
  nsend_next = pack_brick_reverse_f(mpp->send_next, grid, hi->x - nn, hi->x,
                                    lo->y, hi->y, lo->z, hi->z);
  COMM_ISEND(mpp, nsend_prev, prev, x);
  COMM_ISEND(mpp, nsend_next, next, x);
  COMM_WAITALL(mpp);
  unpack_brick_reverse_f(mpp->recv_prev, grid, 0, nn, lo->y, hi->y, lo->z,
                         hi->z);
  unpack_brick_reverse_f(mpp->recv_next, grid, nlocal->x - nn, nlocal->x, lo->y,
                         hi->y, lo->z, hi->z);

  // y phase: x is restricted to [0, nlocal->x); z still spans [lo->z, hi->z).
  COMM_IRECV(mpp, prev, y);
  COMM_IRECV(mpp, next, y);
  nsend_prev = pack_brick_reverse_f(mpp->send_prev, grid, 0, nlocal->x, lo->y,
                                    lo->y + nn, lo->z, hi->z);
  nsend_next = pack_brick_reverse_f(mpp->send_next, grid, 0, nlocal->x,
                                    hi->y - nn, hi->y, lo->z, hi->z);
  COMM_ISEND(mpp, nsend_prev, prev, y);
  COMM_ISEND(mpp, nsend_next, next, y);
  COMM_WAITALL(mpp);
  unpack_brick_reverse_f(mpp->recv_prev, grid, 0, nlocal->x, 0, nn, lo->z,
                         hi->z);
  unpack_brick_reverse_f(mpp->recv_next, grid, 0, nlocal->x, nlocal->y - nn,
                         nlocal->y, lo->z, hi->z);

  // z phase: x and y are both restricted to [0, nlocal).
  COMM_IRECV(mpp, prev, z);
  COMM_IRECV(mpp, next, z);
  nsend_prev = pack_brick_reverse_f(mpp->send_prev, grid, 0, nlocal->x, 0,
                                    nlocal->y, lo->z, lo->z + nn);
  nsend_next = pack_brick_reverse_f(mpp->send_next, grid, 0, nlocal->x, 0,
                                    nlocal->y, hi->z - nn, hi->z);
  COMM_ISEND(mpp, nsend_prev, prev, z);
  COMM_ISEND(mpp, nsend_next, next, z);
  COMM_WAITALL(mpp);
  unpack_brick_reverse_f(mpp->recv_prev, grid, 0, nlocal->x, 0, nlocal->y, 0,
                         nn);
  unpack_brick_reverse_f(mpp->recv_next, grid, 0, nlocal->x, 0, nlocal->y,
                         nlocal->z - nn, nlocal->z);
}

/*
 * Stage one cell's `v` array (meta.natom elements) in a local buffer and
 * write the staged bytes to `buf`. Returns the number of staged bytes
 * (distance from the start of lbuf to the final, 32-aligned cursor).
 *
 * pe_get(a, b, n) is used with `a` as source and `b` as local destination;
 * pe_put(buf, lbuf, n) is presumably the opposite direction - TODO confirm.
 * NOTE(review): UP_ALIGN rounds absolute addresses, so if lbuf is not already
 * 8-byte aligned the staged bytes begin with unwritten padding.
 */
size_t pack_cell_forward_v_cpe(void *buf, celldata_t *cell) {
  dma_init();
  cellmeta_t meta;
  pe_get(&cell->basis, &meta, sizeof(cellmeta_t));
  dma_syn();
  /* Byte array: an array of void is not a valid C object. */
  char lbuf[MAX_CELL_COMM];
  char *lptr = lbuf;
  lptr = UP_ALIGN(lptr, 8);
  pe_get(cell->v, lptr, sizeof((*cell->v)) * meta.natom);
  lptr += sizeof((*cell->v)) * meta.natom;
  dma_syn();
  lptr = UP_ALIGN(lptr, 32);
  /* Length is measured from the staging buffer, not from `buf`. */
  pe_put(buf, lbuf, lptr - lbuf);
  dma_syn();
  return lptr - lbuf;
}
/* Copy cell->v (cell->natom elements) to `buf`; returns the bytes written. */
size_t pack_cell_forward_v(void *buf, celldata_t *cell) {
  const size_t nbytes = sizeof(*(cell->v)) * cell->natom;
  return memtran(buf, cell->v, nbytes);
}
/* Fields on the wire: ['v'].
 * Copy cell->natom elements from `buf` into cell->v; returns the bytes
 * consumed. cell->natom must already hold the element count. */
size_t unpack_cell_forward_v(void *buf, celldata_t *cell) {
  const size_t nbytes = sizeof(*(cell->v)) * cell->natom;
  return memtran(cell->v, buf, nbytes);
}

/*
 * Pack the `v` arrays of every cell with x in [xlo, xhi), y in [ylo, yhi),
 * z in [zlo, zhi) (z varying fastest) into `buf`, back to back.
 * Returns the total number of bytes written.
 */
size_t pack_brick_forward_v(void *buf, cellgrid_t *grid, int xlo, int xhi,
                            int ylo, int yhi, int zlo, int zhi) {
  size_t used = 0;
  for (int ix = xlo; ix < xhi; ++ix) {
    for (int iy = ylo; iy < yhi; ++iy) {
      for (int iz = zlo; iz < zhi; ++iz) {
        celldata_t *c = get_cell_xyz(grid, ix, iy, iz);
        used += pack_cell_forward_v((char *)buf + used, c);
      }
    }
  }
  return used;
}

/*
 * Unpack consecutive `v` arrays from `buf` into every cell with x in
 * [xlo, xhi), y in [ylo, yhi), z in [zlo, zhi) (z varying fastest).
 * Returns the total number of bytes consumed.
 */
size_t unpack_brick_forward_v(void *buf, cellgrid_t *grid, int xlo, int xhi,
                              int ylo, int yhi, int zlo, int zhi) {
  size_t used = 0;
  for (int ix = xlo; ix < xhi; ++ix) {
    for (int iy = ylo; iy < yhi; ++iy) {
      for (int iz = zlo; iz < zhi; ++iz) {
        celldata_t *c = get_cell_xyz(grid, ix, iy, iz);
        used += unpack_cell_forward_v((char *)buf + used, c);
      }
    }
  }
  return used;
}

/* pack_brick_param_t is already defined earlier in this file (ahead of the
 * forward_most CPE brick packers). Repeating the struct body here redefines
 * struct pack_brick_param, which C rejects, so the duplicate is dropped. */
#ifdef __sw_host__
/* Host-side declarations of the slave_-prefixed entry points; presumably the
 * symbols under which pack_brick_forward_v_cpe and
 * unpack_brick_forward_v_cpe are visible to the host - TODO confirm. */
extern void slave_pack_brick_forward_v_cpe(void *);
extern void slave_unpack_brick_forward_v_cpe(void *);
#endif
#ifdef __sw_slave__
#include "dma_macros_new.h"
#include <qthread_slave.h>
/*
 * CPE-side brick packer for `v`: fetch the parameter block and the grid
 * descriptor, then pack every cell of the brick whose linear index satisfies
 * (cellid & 63) == _MYID. Returns the bytes this caller wrote from lpm.buf.
 *
 * Fixes: the loop bounds and the return value now use the fetched copy `lpm`
 * (bare xlo/xhi/.../buf are not declared here), and the mask test is
 * parenthesised because `!=` binds tighter than `&`.
 *
 * NOTE(review): every caller starts at lpm.buf and advances only for the
 * cells it keeps, so callers with different _MYID would write overlapping
 * regions - confirm the intended layout.
 */
size_t pack_brick_forward_v_cpe(pack_brick_param_t *pm) {
  pack_brick_param_t lpm;
  cellgrid_t lgrid;
  pe_get(pm, &lpm, sizeof(pack_brick_param_t));
  dma_syn();
  pe_get(lpm.grid, &lgrid, sizeof(cellgrid_t));
  void *ptr = lpm.buf;
  int ylen = lpm.yhi - lpm.ylo;
  int zlen = lpm.zhi - lpm.zlo;
  dma_syn();
  for (int i = lpm.xlo; i < lpm.xhi; i++) {
    for (int j = lpm.ylo; j < lpm.yhi; j++) {
      for (int k = lpm.zlo; k < lpm.zhi; k++) {
        int cellid = ((i - lpm.xlo) * ylen + (j - lpm.ylo)) * zlen + (k - lpm.zlo);
        if ((cellid & 63) != _MYID)
          continue;
        celldata_t *cell = get_cell_xyz(&lgrid, i, j, k);
        ptr += pack_cell_forward_v(ptr, cell);
      }
    }
  }
  return ptr - lpm.buf;
}
/*
 * CPE-side brick unpacker for `v`: fetch the parameter block and the grid
 * descriptor, then unpack into every cell of the brick whose linear index
 * satisfies (cellid & 63) == _MYID. Returns the bytes this caller consumed
 * from lpm.buf.
 *
 * Fixes: the loop bounds and the return value now use the fetched copy `lpm`
 * (bare xlo/xhi/.../buf are not declared here), and the mask test is
 * parenthesised because `!=` binds tighter than `&`.
 *
 * NOTE(review): every caller starts reading at lpm.buf and advances only for
 * the cells it keeps - confirm the buffer is laid out per caller.
 */
size_t unpack_brick_forward_v_cpe(pack_brick_param_t *pm) {
  pack_brick_param_t lpm;
  cellgrid_t lgrid;
  pe_get(pm, &lpm, sizeof(pack_brick_param_t));
  dma_syn();
  pe_get(lpm.grid, &lgrid, sizeof(cellgrid_t));
  void *ptr = lpm.buf;
  int ylen = lpm.yhi - lpm.ylo;
  int zlen = lpm.zhi - lpm.zlo;
  dma_syn();
  for (int i = lpm.xlo; i < lpm.xhi; i++) {
    for (int j = lpm.ylo; j < lpm.yhi; j++) {
      for (int k = lpm.zlo; k < lpm.zhi; k++) {
        int cellid = ((i - lpm.xlo) * ylen + (j - lpm.ylo)) * zlen + (k - lpm.zlo);
        if ((cellid & 63) != _MYID)
          continue;
        celldata_t *cell = get_cell_xyz(&lgrid, i, j, k);
        ptr += unpack_cell_forward_v(ptr, cell);
      }
    }
  }
  return ptr - lpm.buf;
}

#endif

/*
 * Exchange the per-atom `v` arrays with the prev/next neighbours along z,
 * then y, then x. Each phase posts both receives, packs and sends both
 * boundary bricks, waits for all four requests, and unpacks what was
 * received.
 * Packed bricks use indices inside [0, nlocal); received bricks are unpacked
 * into [lo, lo + nn) and [hi - nn, hi) - presumably the ghost layers of
 * width nn; TODO confirm against the grid definition.
 * The unpack side sizes each record from the receiving cell's own natom.
 */
void forward_comm_v(cellgrid_t *grid, mpp_t *mpp) {
  vec<int> *lo = &(grid->dim.lo);
  vec<int> *hi = &(grid->dim.hi);
  vec<int> *nlocal = &(grid->nlocal);
  int nn = grid->nn;

  size_t nsend_prev, nsend_next;

  // z phase: x and y span only [0, nlocal).
  COMM_IRECV(mpp, prev, z);
  COMM_IRECV(mpp, next, z);
  nsend_prev = pack_brick_forward_v(mpp->send_prev, grid, 0, nlocal->x, 0,
                                    nlocal->y, 0, nn);
  nsend_next = pack_brick_forward_v(mpp->send_next, grid, 0, nlocal->x, 0,
                                    nlocal->y, nlocal->z - nn, nlocal->z);
  COMM_ISEND(mpp, nsend_prev, prev, z);
  COMM_ISEND(mpp, nsend_next, next, z);
  COMM_WAITALL(mpp);
  unpack_brick_forward_v(mpp->recv_prev, grid, 0, nlocal->x, 0, nlocal->y,
                         lo->z, lo->z + nn);
  unpack_brick_forward_v(mpp->recv_next, grid, 0, nlocal->x, 0, nlocal->y,
                         hi->z - nn, hi->z);

  // y phase: z now spans [lo->z, hi->z).
  COMM_IRECV(mpp, prev, y);
  COMM_IRECV(mpp, next, y);
  nsend_prev = pack_brick_forward_v(mpp->send_prev, grid, 0, nlocal->x, 0, nn,
                                    lo->z, hi->z);
  nsend_next = pack_brick_forward_v(mpp->send_next, grid, 0, nlocal->x,
                                    nlocal->y - nn, nlocal->y, lo->z, hi->z);
  COMM_ISEND(mpp, nsend_prev, prev, y);
  COMM_ISEND(mpp, nsend_next, next, y);
  COMM_WAITALL(mpp);
  unpack_brick_forward_v(mpp->recv_prev, grid, 0, nlocal->x, lo->y, lo->y + nn,
                         lo->z, hi->z);
  unpack_brick_forward_v(mpp->recv_next, grid, 0, nlocal->x, hi->y - nn, hi->y,
                         lo->z, hi->z);

  // x phase: both y and z span their full [lo, hi) ranges.
  COMM_IRECV(mpp, prev, x);
  COMM_IRECV(mpp, next, x);
  nsend_prev = pack_brick_forward_v(mpp->send_prev, grid, 0, nn, lo->y, hi->y,
                                    lo->z, hi->z);
  nsend_next = pack_brick_forward_v(mpp->send_next, grid, nlocal->x - nn,
                                    nlocal->x, lo->y, hi->y, lo->z, hi->z);
  COMM_ISEND(mpp, nsend_prev, prev, x);
  COMM_ISEND(mpp, nsend_next, next, x);
  COMM_WAITALL(mpp);
  unpack_brick_forward_v(mpp->recv_prev, grid, lo->x, lo->x + nn, lo->y, hi->y,
                         lo->z, hi->z);
  unpack_brick_forward_v(mpp->recv_next, grid, hi->x - nn, hi->x, lo->y, hi->y,
                         lo->z, hi->z);
}

/*
 * Stage one cell's `v` array (meta.natom elements) in a local buffer and
 * write the staged bytes to `buf`. Returns the number of staged bytes
 * (distance from the start of lbuf to the final, 32-aligned cursor).
 *
 * pe_get(a, b, n) is used with `a` as source and `b` as local destination;
 * pe_put(buf, lbuf, n) is presumably the opposite direction - TODO confirm.
 * NOTE(review): UP_ALIGN rounds absolute addresses, so if lbuf is not already
 * 8-byte aligned the staged bytes begin with unwritten padding.
 */
size_t pack_cell_reverse_v_cpe(void *buf, celldata_t *cell) {
  dma_init();
  cellmeta_t meta;
  pe_get(&cell->basis, &meta, sizeof(cellmeta_t));
  dma_syn();
  /* Byte array: an array of void is not a valid C object. */
  char lbuf[MAX_CELL_COMM];
  char *lptr = lbuf;
  lptr = UP_ALIGN(lptr, 8);
  pe_get(cell->v, lptr, sizeof((*cell->v)) * meta.natom);
  lptr += sizeof((*cell->v)) * meta.natom;
  dma_syn();
  lptr = UP_ALIGN(lptr, 32);
  /* Length is measured from the staging buffer, not from `buf`. */
  pe_put(buf, lbuf, lptr - lbuf);
  dma_syn();
  return lptr - lbuf;
}
/* Copy cell->v (cell->natom elements) to `buf`; returns the bytes written. */
size_t pack_cell_reverse_v(void *buf, celldata_t *cell) {
  const size_t nbytes = sizeof(*(cell->v)) * cell->natom;
  return memtran(buf, cell->v, nbytes);
}
/* Fields on the wire: ['v'].
 * Accumulate (not overwrite): each of the cell->natom vectors read from
 * `buf` is added to the matching cell->v entry via vecaddv.
 * Returns the bytes consumed. */
size_t unpack_cell_reverse_v(void *buf, celldata_t *cell) {
  vec<real> *incoming = buf;
  int idx = 0;
  while (idx < cell->natom) {
    vecaddv(cell->v[idx], cell->v[idx], incoming[idx]);
    idx++;
  }
  return sizeof(*(cell->v)) * cell->natom;
}

/*
 * Pack the `v` arrays of every cell with x in [xlo, xhi), y in [ylo, yhi),
 * z in [zlo, zhi) (z varying fastest) into `buf`, back to back.
 * Returns the total number of bytes written.
 */
size_t pack_brick_reverse_v(void *buf, cellgrid_t *grid, int xlo, int xhi,
                            int ylo, int yhi, int zlo, int zhi) {
  size_t used = 0;
  for (int ix = xlo; ix < xhi; ++ix) {
    for (int iy = ylo; iy < yhi; ++iy) {
      for (int iz = zlo; iz < zhi; ++iz) {
        celldata_t *c = get_cell_xyz(grid, ix, iy, iz);
        used += pack_cell_reverse_v((char *)buf + used, c);
      }
    }
  }
  return used;
}

/*
 * Accumulate consecutive `v` arrays from `buf` into every cell with x in
 * [xlo, xhi), y in [ylo, yhi), z in [zlo, zhi) (z varying fastest).
 * Returns the total number of bytes consumed.
 */
size_t unpack_brick_reverse_v(void *buf, cellgrid_t *grid, int xlo, int xhi,
                              int ylo, int yhi, int zlo, int zhi) {
  size_t used = 0;
  for (int ix = xlo; ix < xhi; ++ix) {
    for (int iy = ylo; iy < yhi; ++iy) {
      for (int iz = zlo; iz < zhi; ++iz) {
        celldata_t *c = get_cell_xyz(grid, ix, iy, iz);
        used += unpack_cell_reverse_v((char *)buf + used, c);
      }
    }
  }
  return used;
}

/* pack_brick_param_t is already defined earlier in this file (ahead of the
 * forward_most CPE brick packers). Repeating the struct body here redefines
 * struct pack_brick_param, which C rejects, so the duplicate is dropped. */
#ifdef __sw_host__
/* Host-side declarations of the slave_-prefixed entry points; presumably the
 * symbols under which pack_brick_reverse_v_cpe and
 * unpack_brick_reverse_v_cpe are visible to the host - TODO confirm. */
extern void slave_pack_brick_reverse_v_cpe(void *);
extern void slave_unpack_brick_reverse_v_cpe(void *);
#endif
#ifdef __sw_slave__
#include "dma_macros_new.h"
#include <qthread_slave.h>
/*
 * CPE-side brick packer for `v` (reverse direction): fetch the parameter
 * block and the grid descriptor, then pack every cell of the brick whose
 * linear index satisfies (cellid & 63) == _MYID. Returns the bytes this
 * caller wrote from lpm.buf.
 *
 * Fixes: the loop bounds and the return value now use the fetched copy `lpm`
 * (bare xlo/xhi/.../buf are not declared here), and the mask test is
 * parenthesised because `!=` binds tighter than `&`.
 *
 * NOTE(review): every caller starts at lpm.buf and advances only for the
 * cells it keeps, so callers with different _MYID would write overlapping
 * regions - confirm the intended layout.
 */
size_t pack_brick_reverse_v_cpe(pack_brick_param_t *pm) {
  pack_brick_param_t lpm;
  cellgrid_t lgrid;
  pe_get(pm, &lpm, sizeof(pack_brick_param_t));
  dma_syn();
  pe_get(lpm.grid, &lgrid, sizeof(cellgrid_t));
  void *ptr = lpm.buf;
  int ylen = lpm.yhi - lpm.ylo;
  int zlen = lpm.zhi - lpm.zlo;
  dma_syn();
  for (int i = lpm.xlo; i < lpm.xhi; i++) {
    for (int j = lpm.ylo; j < lpm.yhi; j++) {
      for (int k = lpm.zlo; k < lpm.zhi; k++) {
        int cellid = ((i - lpm.xlo) * ylen + (j - lpm.ylo)) * zlen + (k - lpm.zlo);
        if ((cellid & 63) != _MYID)
          continue;
        celldata_t *cell = get_cell_xyz(&lgrid, i, j, k);
        ptr += pack_cell_reverse_v(ptr, cell);
      }
    }
  }
  return ptr - lpm.buf;
}
/*
 * CPE-side brick unpacker for `v` (reverse direction): fetch the parameter
 * block and the grid descriptor, then accumulate into every cell of the
 * brick whose linear index satisfies (cellid & 63) == _MYID. Returns the
 * bytes this caller consumed from lpm.buf.
 *
 * Fixes: the loop bounds and the return value now use the fetched copy `lpm`
 * (bare xlo/xhi/.../buf are not declared here), and the mask test is
 * parenthesised because `!=` binds tighter than `&`.
 *
 * NOTE(review): every caller starts reading at lpm.buf and advances only for
 * the cells it keeps - confirm the buffer is laid out per caller.
 */
size_t unpack_brick_reverse_v_cpe(pack_brick_param_t *pm) {
  pack_brick_param_t lpm;
  cellgrid_t lgrid;
  pe_get(pm, &lpm, sizeof(pack_brick_param_t));
  dma_syn();
  pe_get(lpm.grid, &lgrid, sizeof(cellgrid_t));
  void *ptr = lpm.buf;
  int ylen = lpm.yhi - lpm.ylo;
  int zlen = lpm.zhi - lpm.zlo;
  dma_syn();
  for (int i = lpm.xlo; i < lpm.xhi; i++) {
    for (int j = lpm.ylo; j < lpm.yhi; j++) {
      for (int k = lpm.zlo; k < lpm.zhi; k++) {
        int cellid = ((i - lpm.xlo) * ylen + (j - lpm.ylo)) * zlen + (k - lpm.zlo);
        if ((cellid & 63) != _MYID)
          continue;
        celldata_t *cell = get_cell_xyz(&lgrid, i, j, k);
        ptr += unpack_cell_reverse_v(ptr, cell);
      }
    }
  }
  return ptr - lpm.buf;
}

#endif

/*
 * Exchanges "reverse_v" cell data with the prev/next neighbours along x,
 * then y, then z.
 *
 * Each axis follows the same sequence: post both non-blocking receives,
 * pack the bricks [lo, lo+nn) (for prev) and [hi-nn, hi) (for next) into the
 * send buffers, start both sends, wait for all four requests, then unpack
 * recv_prev into [0, nn) and recv_next into [nlocal-nn, nlocal) on that
 * axis. Axes that have already been exchanged are covered over their full
 * 0..nlocal range in later phases; axes still to come use lo..hi.
 *
 * grid - cell grid; dim.lo/dim.hi, nlocal and nn are read from it.
 * mpp  - communication state (buffers, requests, neighbour ranks) used by
 *        the COMM_* macros.
 */
void reverse_comm_v(cellgrid_t *grid, mpp_t *mpp) {
  vec<int> *lo = &(grid->dim.lo);
  vec<int> *hi = &(grid->dim.hi);
  vec<int> *nlocal = &(grid->nlocal);
  int nn = grid->nn;

  size_t nsend_prev, nsend_next;

  // ---- x axis: y and z restricted to lo..hi ----
  COMM_IRECV(mpp, prev, x);
  COMM_IRECV(mpp, next, x);
  nsend_prev = pack_brick_reverse_v(mpp->send_prev, grid, lo->x, lo->x + nn,
                                    lo->y, hi->y, lo->z, hi->z);
  nsend_next = pack_brick_reverse_v(mpp->send_next, grid, hi->x - nn, hi->x,
                                    lo->y, hi->y, lo->z, hi->z);
  COMM_ISEND(mpp, nsend_prev, prev, x);
  COMM_ISEND(mpp, nsend_next, next, x);
  // Send buffers are reused by the next phase, so all four requests must
  // complete before continuing.
  COMM_WAITALL(mpp);
  unpack_brick_reverse_v(mpp->recv_prev, grid, 0, nn, lo->y, hi->y, lo->z,
                         hi->z);
  unpack_brick_reverse_v(mpp->recv_next, grid, nlocal->x - nn, nlocal->x, lo->y,
                         hi->y, lo->z, hi->z);

  // ---- y axis: x now spans 0..nlocal->x, z still lo..hi ----
  COMM_IRECV(mpp, prev, y);
  COMM_IRECV(mpp, next, y);
  nsend_prev = pack_brick_reverse_v(mpp->send_prev, grid, 0, nlocal->x, lo->y,
                                    lo->y + nn, lo->z, hi->z);
  nsend_next = pack_brick_reverse_v(mpp->send_next, grid, 0, nlocal->x,
                                    hi->y - nn, hi->y, lo->z, hi->z);
  COMM_ISEND(mpp, nsend_prev, prev, y);
  COMM_ISEND(mpp, nsend_next, next, y);
  COMM_WAITALL(mpp);
  unpack_brick_reverse_v(mpp->recv_prev, grid, 0, nlocal->x, 0, nn, lo->z,
                         hi->z);
  unpack_brick_reverse_v(mpp->recv_next, grid, 0, nlocal->x, nlocal->y - nn,
                         nlocal->y, lo->z, hi->z);

  // ---- z axis: x and y both span their full 0..nlocal ranges ----
  COMM_IRECV(mpp, prev, z);
  COMM_IRECV(mpp, next, z);
  nsend_prev = pack_brick_reverse_v(mpp->send_prev, grid, 0, nlocal->x, 0,
                                    nlocal->y, lo->z, lo->z + nn);
  nsend_next = pack_brick_reverse_v(mpp->send_next, grid, 0, nlocal->x, 0,
                                    nlocal->y, hi->z - nn, hi->z);
  COMM_ISEND(mpp, nsend_prev, prev, z);
  COMM_ISEND(mpp, nsend_next, next, z);
  COMM_WAITALL(mpp);
  unpack_brick_reverse_v(mpp->recv_prev, grid, 0, nlocal->x, 0, nlocal->y, 0,
                         nn);
  unpack_brick_reverse_v(mpp->recv_next, grid, 0, nlocal->x, 0, nlocal->y,
                         nlocal->z - nn, nlocal->z);
}

/*
 * Worker-side packer for one cell's export list.
 *
 * The cell's metadata (cell->basis) is copied into a local cellmeta_t, the
 * exported tail of every per-atom / per-term array is staged into the local
 * buffer lbuf with pe_get, and the staged bytes are then written to buf with
 * a single pe_put.
 *
 * Staged layout: nexport (int), x, q, tag, t, v, mass, the five *_export
 * counts (int each), first_bonded, first_chain2, first_scal_atom,
 * first_excl_atom, first_impr (nexport + 1 entries each), bonded_tag,
 * chain2_tag, excl_tag, scal_tag, impr_idx, shake. The write cursor is
 * rounded up with UP_ALIGN before each field and to 32 at the end.
 *
 * buf  - destination of the final pe_put.
 * cell - source cell.
 * Returns the number of bytes staged (and transferred).
 *
 * NOTE(review): UP_ALIGN rounds the absolute address, so the padding depends
 * on where lbuf is placed; confirm lbuf is suitably aligned. The host
 * unpack_cell_forward_export_list reads 8-byte counts without padding, so
 * this layout does not match it - confirm which reader consumes this stream.
 */
size_t pack_cell_forward_export_list_cpe(void *buf, celldata_t *cell) {
  dma_init();
  cellmeta_t meta;
  pe_get(&cell->basis, &meta, sizeof(cellmeta_t));
  dma_syn();
  char lbuf[MAX_CELL_COMM];
  char *lptr = lbuf;

  *(int *)lptr = meta.nexport;
  lptr += 4;

  /* Per-atom arrays: the exported atoms occupy the last nexport slots. */
  lptr = (char *)UP_ALIGN(lptr, 8);
  pe_get(cell->x + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->x)) * meta.nexport);
  lptr += sizeof((*cell->x)) * meta.nexport;
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 8);
  pe_get(cell->q + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->q)) * meta.nexport);
  lptr += sizeof((*cell->q)) * meta.nexport;
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 8);
  pe_get(cell->tag + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->tag)) * meta.nexport);
  lptr += sizeof((*cell->tag)) * meta.nexport;
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 4);
  pe_get(cell->t + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->t)) * meta.nexport);
  lptr += sizeof((*cell->t)) * meta.nexport;
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 8);
  pe_get(cell->v + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->v)) * meta.nexport);
  lptr += sizeof((*cell->v)) * meta.nexport;
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 8);
  pe_get(cell->mass + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->mass)) * meta.nexport);
  lptr += sizeof((*cell->mass)) * meta.nexport;
  dma_syn();

  /* Scalar counts come straight from the local metadata copy. */
  lptr = (char *)UP_ALIGN(lptr, 4);
  *(int *)lptr = meta.nbonded_export;
  lptr += 4;
  lptr = (char *)UP_ALIGN(lptr, 4);
  *(int *)lptr = meta.nchain2_export;
  lptr += 4;
  lptr = (char *)UP_ALIGN(lptr, 4);
  *(int *)lptr = meta.nscal_export;
  lptr += 4;
  lptr = (char *)UP_ALIGN(lptr, 4);
  *(int *)lptr = meta.nexcl_export;
  lptr += 4;
  lptr = (char *)UP_ALIGN(lptr, 4);
  *(int *)lptr = meta.nimpr_export;
  lptr += 4;

  /* Offset tables carry one extra terminating entry (nexport + 1). */
  lptr = (char *)UP_ALIGN(lptr, 4);
  pe_get(cell->first_bonded + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->first_bonded)) * (meta.nexport + 1));
  lptr += sizeof((*cell->first_bonded)) * (meta.nexport + 1);
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 4);
  pe_get(cell->first_chain2 + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->first_chain2)) * (meta.nexport + 1));
  lptr += sizeof((*cell->first_chain2)) * (meta.nexport + 1);
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 4);
  pe_get(cell->first_scal_atom + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->first_scal_atom)) * (meta.nexport + 1));
  lptr += sizeof((*cell->first_scal_atom)) * (meta.nexport + 1);
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 4);
  pe_get(cell->first_excl_atom + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->first_excl_atom)) * (meta.nexport + 1));
  lptr += sizeof((*cell->first_excl_atom)) * (meta.nexport + 1);
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 4);
  pe_get(cell->first_impr + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->first_impr)) * (meta.nexport + 1));
  lptr += sizeof((*cell->first_impr)) * (meta.nexport + 1);
  dma_syn();

  /* Per-term arrays: exported entries occupy the tail of each array. */
  lptr = (char *)UP_ALIGN(lptr, 8);
  pe_get(cell->bonded_tag + (MAX_BONDED_CELL - meta.nbonded_export), lptr,
         sizeof((*cell->bonded_tag)) * meta.nbonded_export);
  lptr += sizeof((*cell->bonded_tag)) * meta.nbonded_export;
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 8);
  pe_get(cell->chain2_tag + (MAX_CHAIN2_CELL - meta.nchain2_export), lptr,
         sizeof((*cell->chain2_tag)) * meta.nchain2_export);
  lptr += sizeof((*cell->chain2_tag)) * meta.nchain2_export;
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 8);
  pe_get(cell->excl_tag + (MAX_EXCL_CELL - meta.nexcl_export), lptr,
         sizeof((*cell->excl_tag)) * meta.nexcl_export);
  lptr += sizeof((*cell->excl_tag)) * meta.nexcl_export;
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 8);
  pe_get(cell->scal_tag + (MAX_SCAL_CELL - meta.nscal_export), lptr,
         sizeof((*cell->scal_tag)) * meta.nscal_export);
  lptr += sizeof((*cell->scal_tag)) * meta.nscal_export;
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 4);
  pe_get(cell->impr_idx + (MAX_IMPR_CELL - meta.nimpr_export), lptr,
         sizeof((*cell->impr_idx)) * meta.nimpr_export);
  lptr += sizeof((*cell->impr_idx)) * meta.nimpr_export;
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 8);
  pe_get(cell->shake + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->shake)) * meta.nexport);
  lptr += sizeof((*cell->shake)) * meta.nexport;
  dma_syn();

  /* Flush exactly the staged bytes; the length is measured against lbuf,
   * the buffer lptr walks, not against the destination buf. */
  lptr = (char *)UP_ALIGN(lptr, 32);
  pe_put(buf, lbuf, lptr - lbuf);
  dma_syn();
  return (size_t)(lptr - lbuf);
}
/*
 * Serialises one cell's export list into buf.
 *
 * Stream layout: nexport, then x, q, tag, t, v, mass (nexport entries each,
 * taken from the tail of each array), then nbonded_export, nchain2_export,
 * nscal_export, nexcl_export, nimpr_export, then the five first_* tables
 * (nexport + 1 entries each), then bonded_tag, chain2_tag, excl_tag,
 * scal_tag, impr_idx (sized by their own *_export count), then shake.
 * Every count occupies an 8-byte slot holding a long.
 *
 * Counts are written with memcpy rather than through a casted long pointer:
 * the preceding fields need not total a multiple of 8 bytes, so the slot may
 * be misaligned for a direct long store.
 *
 * buf  - destination; must be large enough for the whole record.
 * cell - source cell.
 * Returns the number of bytes written.
 */
size_t pack_cell_forward_export_list(void *buf, celldata_t *cell) {
  const size_t slot = 8; /* bytes reserved per count in the stream */
  char *ptr = (char *)buf;
  long count;

  count = cell->nexport;
  memcpy(ptr, &count, sizeof count);
  ptr += slot;

  pack_field(ptr, cell->x + (CELL_CAP - cell->nexport), cell->nexport);
  pack_field(ptr, cell->q + (CELL_CAP - cell->nexport), cell->nexport);
  pack_field(ptr, cell->tag + (CELL_CAP - cell->nexport), cell->nexport);
  pack_field(ptr, cell->t + (CELL_CAP - cell->nexport), cell->nexport);
  pack_field(ptr, cell->v + (CELL_CAP - cell->nexport), cell->nexport);
  pack_field(ptr, cell->mass + (CELL_CAP - cell->nexport), cell->nexport);

  count = cell->nbonded_export;
  memcpy(ptr, &count, sizeof count);
  ptr += slot;
  count = cell->nchain2_export;
  memcpy(ptr, &count, sizeof count);
  ptr += slot;
  count = cell->nscal_export;
  memcpy(ptr, &count, sizeof count);
  ptr += slot;
  count = cell->nexcl_export;
  memcpy(ptr, &count, sizeof count);
  ptr += slot;
  count = cell->nimpr_export;
  memcpy(ptr, &count, sizeof count);
  ptr += slot;

  pack_field(ptr, cell->first_bonded + (CELL_CAP - cell->nexport),
             cell->nexport + 1);
  pack_field(ptr, cell->first_chain2 + (CELL_CAP - cell->nexport),
             cell->nexport + 1);
  pack_field(ptr, cell->first_scal_atom + (CELL_CAP - cell->nexport),
             cell->nexport + 1);
  pack_field(ptr, cell->first_excl_atom + (CELL_CAP - cell->nexport),
             cell->nexport + 1);
  pack_field(ptr, cell->first_impr + (CELL_CAP - cell->nexport),
             cell->nexport + 1);

  pack_field(ptr, cell->bonded_tag + (MAX_BONDED_CELL - cell->nbonded_export),
             cell->nbonded_export);
  pack_field(ptr, cell->chain2_tag + (MAX_CHAIN2_CELL - cell->nchain2_export),
             cell->nchain2_export);
  pack_field(ptr, cell->excl_tag + (MAX_EXCL_CELL - cell->nexcl_export),
             cell->nexcl_export);
  pack_field(ptr, cell->scal_tag + (MAX_SCAL_CELL - cell->nscal_export),
             cell->nscal_export);
  pack_field(ptr, cell->impr_idx + (MAX_IMPR_CELL - cell->nimpr_export),
             cell->nimpr_export);
  pack_field(ptr, cell->shake + (CELL_CAP - cell->nexport), cell->nexport);

  return (size_t)(ptr - (char *)buf);
}
//['x', 'q', 'tag', 't', 'v', 'mass', 'nbonded_export', 'nchain2_export',
//'nscal_export', 'nexcl_export', 'nimpr_export', 'first_bonded',
//'first_chain2', 'first_scal_atom', 'first_excl_atom', 'first_impr',
//'bonded_tag', 'chain2_tag', 'excl_tag', 'scal_tag', 'impr_idx', 'shake']
/*
 * Deserialises one cell's export list from buf; the inverse of
 * pack_cell_forward_export_list.
 *
 * Each count is read before the arrays that depend on it, because the count
 * determines both how many entries follow and where in the cell's arrays
 * (tail-aligned: CAP - count) they are stored. Every count occupies an
 * 8-byte slot holding a long.
 *
 * Counts are read with memcpy rather than through a casted long pointer:
 * the preceding fields need not total a multiple of 8 bytes, so the slot may
 * be misaligned for a direct long load.
 *
 * buf  - source record.
 * cell - destination cell; its *_export counts and array tails are
 *        overwritten.
 * Returns the number of bytes consumed.
 */
size_t unpack_cell_forward_export_list(void *buf, celldata_t *cell) {
  const size_t slot = 8; /* bytes reserved per count in the stream */
  char *ptr = (char *)buf;
  long count;

  memcpy(&count, ptr, sizeof count);
  cell->nexport = count;
  ptr += slot;

  unpack_field(ptr, cell->x + (CELL_CAP - cell->nexport), cell->nexport);
  unpack_field(ptr, cell->q + (CELL_CAP - cell->nexport), cell->nexport);
  unpack_field(ptr, cell->tag + (CELL_CAP - cell->nexport), cell->nexport);
  unpack_field(ptr, cell->t + (CELL_CAP - cell->nexport), cell->nexport);
  unpack_field(ptr, cell->v + (CELL_CAP - cell->nexport), cell->nexport);
  unpack_field(ptr, cell->mass + (CELL_CAP - cell->nexport), cell->nexport);

  memcpy(&count, ptr, sizeof count);
  cell->nbonded_export = count;
  ptr += slot;
  memcpy(&count, ptr, sizeof count);
  cell->nchain2_export = count;
  ptr += slot;
  memcpy(&count, ptr, sizeof count);
  cell->nscal_export = count;
  ptr += slot;
  memcpy(&count, ptr, sizeof count);
  cell->nexcl_export = count;
  ptr += slot;
  memcpy(&count, ptr, sizeof count);
  cell->nimpr_export = count;
  ptr += slot;

  unpack_field(ptr, cell->first_bonded + (CELL_CAP - cell->nexport),
               cell->nexport + 1);
  unpack_field(ptr, cell->first_chain2 + (CELL_CAP - cell->nexport),
               cell->nexport + 1);
  unpack_field(ptr, cell->first_scal_atom + (CELL_CAP - cell->nexport),
               cell->nexport + 1);
  unpack_field(ptr, cell->first_excl_atom + (CELL_CAP - cell->nexport),
               cell->nexport + 1);
  unpack_field(ptr, cell->first_impr + (CELL_CAP - cell->nexport),
               cell->nexport + 1);

  unpack_field(ptr, cell->bonded_tag + (MAX_BONDED_CELL - cell->nbonded_export),
               cell->nbonded_export);
  unpack_field(ptr, cell->chain2_tag + (MAX_CHAIN2_CELL - cell->nchain2_export),
               cell->nchain2_export);
  unpack_field(ptr, cell->excl_tag + (MAX_EXCL_CELL - cell->nexcl_export),
               cell->nexcl_export);
  unpack_field(ptr, cell->scal_tag + (MAX_SCAL_CELL - cell->nscal_export),
               cell->nscal_export);
  unpack_field(ptr, cell->impr_idx + (MAX_IMPR_CELL - cell->nimpr_export),
               cell->nimpr_export);
  unpack_field(ptr, cell->shake + (CELL_CAP - cell->nexport), cell->nexport);

  return (size_t)(ptr - (char *)buf);
}

/*
 * Packs the export lists of every cell in the brick
 * [xlo,xhi) x [ylo,yhi) x [zlo,zhi) back to back into buf, visiting cells
 * with x outermost and z innermost.
 *
 * Returns the total number of bytes written.
 */
size_t pack_brick_forward_export_list(void *buf, cellgrid_t *grid, int xlo,
                                      int xhi, int ylo, int yhi, int zlo,
                                      int zhi) {
  size_t used = 0;
  int ix, iy, iz;

  for (ix = xlo; ix < xhi; ++ix)
    for (iy = ylo; iy < yhi; ++iy)
      for (iz = zlo; iz < zhi; ++iz) {
        celldata_t *src = get_cell_xyz(grid, ix, iy, iz);
        used += pack_cell_forward_export_list((char *)buf + used, src);
      }

  return used;
}

/*
 * Unpacks consecutive export-list records from buf into every cell of the
 * brick [xlo,xhi) x [ylo,yhi) x [zlo,zhi), visiting cells with x outermost
 * and z innermost.
 *
 * Returns the total number of bytes consumed.
 */
size_t unpack_brick_forward_export_list(void *buf, cellgrid_t *grid, int xlo,
                                        int xhi, int ylo, int yhi, int zlo,
                                        int zhi) {
  size_t used = 0;
  int ix, iy, iz;

  for (ix = xlo; ix < xhi; ++ix)
    for (iy = ylo; iy < yhi; ++iy)
      for (iz = zlo; iz < zhi; ++iz) {
        celldata_t *dst = get_cell_xyz(grid, ix, iy, iz);
        used += unpack_cell_forward_export_list((char *)buf + used, dst);
      }

  return used;
}

/*
 * Argument bundle handed to the *_brick_*_cpe routines below.
 *
 *   buf   - start of the packed byte stream the routine reads or writes.
 *   grid  - cell grid descriptor; the routines copy it locally before use.
 *   x/y/z lo,hi - half-open cell index ranges [lo, hi) of the brick.
 *
 * NOTE(review): this exact struct/typedef is defined more than once in this
 * file; confirm the build tolerates the repeated definition or hoist it into
 * a single guarded declaration.
 */
typedef struct pack_brick_param {
  void *buf;
  cellgrid_t *grid;
  int xlo, xhi, ylo, yhi, zlo, zhi;
} pack_brick_param_t;
#ifdef __sw_host__
/* Host-side declarations of the slave_-prefixed entry points; presumably the
 * launcher symbols for the *_cpe routines of the same name - TODO confirm. */
extern void slave_pack_brick_forward_export_list_cpe(void *);
extern void slave_unpack_brick_forward_export_list_cpe(void *);
#endif
#ifdef __sw_slave__
#include "dma_macros_new.h"
#include <qthread_slave.h>
/*
 * Worker-side packer for the export-list exchange over one brick of cells.
 *
 * pm points at a pack_brick_param_t describing the destination buffer, the
 * grid and the half-open index ranges [xlo,xhi) x [ylo,yhi) x [zlo,zhi).
 * The parameter block and grid descriptor are copied locally with
 * pe_get/dma_syn; cells are numbered in x-major, z-fastest order and only
 * those whose number has low six bits equal to _MYID are packed here.
 *
 * Returns the number of bytes this worker appended after lpm.buf.
 *
 * NOTE(review): every worker starts at lpm.buf and advances only by its own
 * cells, so concurrent workers would write overlapping regions - confirm how
 * per-worker offsets are meant to be derived. Sibling pack_cell_*_cpe
 * routines also call dma_init() before their first pe_get; confirm whether
 * that is required here.
 */
size_t pack_brick_forward_export_list_cpe(pack_brick_param_t *pm) {
  pack_brick_param_t lpm;
  cellgrid_t lgrid;
  /* The grid pointer lives inside the parameter block, so the block has to
   * arrive before the grid transfer can be issued. */
  pe_get(pm, &lpm, sizeof(pack_brick_param_t));
  dma_syn();
  pe_get(lpm.grid, &lgrid, sizeof(cellgrid_t));
  dma_syn();

  char *ptr = (char *)lpm.buf;
  const int ylen = lpm.yhi - lpm.ylo;
  const int zlen = lpm.zhi - lpm.zlo;

  for (int i = lpm.xlo; i < lpm.xhi; i++) {
    for (int j = lpm.ylo; j < lpm.yhi; j++) {
      for (int k = lpm.zlo; k < lpm.zhi; k++) {
        int cellid = ((i - lpm.xlo) * ylen + (j - lpm.ylo)) * zlen + (k - lpm.zlo);
        /* Parenthesised on purpose: `!=` binds tighter than `&`. */
        if ((cellid & 63) != _MYID)
          continue;
        celldata_t *cell = get_cell_xyz(&lgrid, i, j, k);
        ptr += pack_cell_forward_export_list(ptr, cell);
      }
    }
  }
  return (size_t)(ptr - (char *)lpm.buf);
}
/*
 * Worker-side unpacker for the export-list exchange over one brick of cells.
 *
 * pm points at a pack_brick_param_t describing the source buffer, the grid
 * and the half-open index ranges [xlo,xhi) x [ylo,yhi) x [zlo,zhi). The
 * parameter block and grid descriptor are copied locally with
 * pe_get/dma_syn; cells are numbered in x-major, z-fastest order and only
 * those whose number has low six bits equal to _MYID are handled here.
 *
 * Returns the number of bytes this worker consumed starting at lpm.buf.
 *
 * NOTE(review): every worker starts reading at lpm.buf and advances only by
 * its own cells - confirm how per-worker offsets are meant to be derived.
 * Sibling pack_cell_*_cpe routines also call dma_init() before their first
 * pe_get; confirm whether that is required here.
 */
size_t unpack_brick_forward_export_list_cpe(pack_brick_param_t *pm) {
  pack_brick_param_t lpm;
  cellgrid_t lgrid;
  /* The grid pointer lives inside the parameter block, so the block has to
   * arrive before the grid transfer can be issued. */
  pe_get(pm, &lpm, sizeof(pack_brick_param_t));
  dma_syn();
  pe_get(lpm.grid, &lgrid, sizeof(cellgrid_t));
  dma_syn();

  char *ptr = (char *)lpm.buf;
  const int ylen = lpm.yhi - lpm.ylo;
  const int zlen = lpm.zhi - lpm.zlo;

  for (int i = lpm.xlo; i < lpm.xhi; i++) {
    for (int j = lpm.ylo; j < lpm.yhi; j++) {
      for (int k = lpm.zlo; k < lpm.zhi; k++) {
        int cellid = ((i - lpm.xlo) * ylen + (j - lpm.ylo)) * zlen + (k - lpm.zlo);
        /* Parenthesised on purpose: `!=` binds tighter than `&`. */
        if ((cellid & 63) != _MYID)
          continue;
        celldata_t *cell = get_cell_xyz(&lgrid, i, j, k);
        ptr += unpack_cell_forward_export_list(ptr, cell);
      }
    }
  }
  return (size_t)(ptr - (char *)lpm.buf);
}

#endif

/*
 * Exchanges cell export lists with the prev/next neighbours along z, then
 * y, then x.
 *
 * Each axis follows the same sequence: post both non-blocking receives,
 * pack the bricks [0, nn) (for prev) and [nlocal-nn, nlocal) (for next) into
 * the send buffers, start both sends, wait for all four requests, then
 * unpack recv_prev into [lo, lo+nn) and recv_next into [hi-nn, hi) on that
 * axis. Axes not yet exchanged span their full 0..nlocal range; axes that
 * have already been exchanged are restricted to lo..hi.
 *
 * grid - cell grid; dim.lo/dim.hi, nlocal and nn are read from it.
 * mpp  - communication state (buffers, requests, neighbour ranks) used by
 *        the COMM_* macros.
 */
void forward_comm_export_list(cellgrid_t *grid, mpp_t *mpp) {
  vec<int> *lo = &(grid->dim.lo);
  vec<int> *hi = &(grid->dim.hi);
  vec<int> *nlocal = &(grid->nlocal);
  int nn = grid->nn;

  size_t nsend_prev, nsend_next;

  // ---- z axis: x and y span their full 0..nlocal ranges ----
  COMM_IRECV(mpp, prev, z);
  COMM_IRECV(mpp, next, z);
  nsend_prev = pack_brick_forward_export_list(mpp->send_prev, grid, 0,
                                              nlocal->x, 0, nlocal->y, 0, nn);
  nsend_next =
      pack_brick_forward_export_list(mpp->send_next, grid, 0, nlocal->x, 0,
                                     nlocal->y, nlocal->z - nn, nlocal->z);
  COMM_ISEND(mpp, nsend_prev, prev, z);
  COMM_ISEND(mpp, nsend_next, next, z);
  // Send buffers are reused by the next phase, so all four requests must
  // complete before continuing.
  COMM_WAITALL(mpp);
  unpack_brick_forward_export_list(mpp->recv_prev, grid, 0, nlocal->x, 0,
                                   nlocal->y, lo->z, lo->z + nn);
  unpack_brick_forward_export_list(mpp->recv_next, grid, 0, nlocal->x, 0,
                                   nlocal->y, hi->z - nn, hi->z);

  // ---- y axis: x still full range, z restricted to lo..hi ----
  COMM_IRECV(mpp, prev, y);
  COMM_IRECV(mpp, next, y);
  nsend_prev = pack_brick_forward_export_list(mpp->send_prev, grid, 0,
                                              nlocal->x, 0, nn, lo->z, hi->z);
  nsend_next =
      pack_brick_forward_export_list(mpp->send_next, grid, 0, nlocal->x,
                                     nlocal->y - nn, nlocal->y, lo->z, hi->z);
  COMM_ISEND(mpp, nsend_prev, prev, y);
  COMM_ISEND(mpp, nsend_next, next, y);
  COMM_WAITALL(mpp);
  unpack_brick_forward_export_list(mpp->recv_prev, grid, 0, nlocal->x, lo->y,
                                   lo->y + nn, lo->z, hi->z);
  unpack_brick_forward_export_list(mpp->recv_next, grid, 0, nlocal->x,
                                   hi->y - nn, hi->y, lo->z, hi->z);

  // ---- x axis: y and z both restricted to lo..hi ----
  COMM_IRECV(mpp, prev, x);
  COMM_IRECV(mpp, next, x);
  nsend_prev = pack_brick_forward_export_list(mpp->send_prev, grid, 0, nn,
                                              lo->y, hi->y, lo->z, hi->z);
  nsend_next =
      pack_brick_forward_export_list(mpp->send_next, grid, nlocal->x - nn,
                                     nlocal->x, lo->y, hi->y, lo->z, hi->z);
  COMM_ISEND(mpp, nsend_prev, prev, x);
  COMM_ISEND(mpp, nsend_next, next, x);
  COMM_WAITALL(mpp);
  unpack_brick_forward_export_list(mpp->recv_prev, grid, lo->x, lo->x + nn,
                                   lo->y, hi->y, lo->z, hi->z);
  unpack_brick_forward_export_list(mpp->recv_next, grid, hi->x - nn, hi->x,
                                   lo->y, hi->y, lo->z, hi->z);
}

/*
 * Worker-side packer for one cell's export list, "cg" variant (same fields
 * as the plain export list plus shake_tmp).
 *
 * The cell's metadata (cell->basis) is copied into a local cellmeta_t, the
 * exported tail of every per-atom / per-term array is staged into the local
 * buffer lbuf with pe_get, and the staged bytes are then written to buf with
 * a single pe_put.
 *
 * Staged layout: nexport (int), x, q, tag, t, v, mass, the five *_export
 * counts (int each), first_bonded, first_chain2, first_scal_atom,
 * first_excl_atom, first_impr (nexport + 1 entries each), bonded_tag,
 * chain2_tag, excl_tag, scal_tag, impr_idx, shake, shake_tmp. The write
 * cursor is rounded up with UP_ALIGN before each field and to 32 at the end.
 *
 * buf  - destination of the final pe_put.
 * cell - source cell.
 * Returns the number of bytes staged (and transferred).
 *
 * NOTE(review): UP_ALIGN rounds the absolute address, so the padding depends
 * on where lbuf is placed; confirm lbuf is suitably aligned. The host
 * unpack_cell_forward_export_list_cg reads 8-byte counts without padding, so
 * this layout does not match it - confirm which reader consumes this stream.
 */
size_t pack_cell_forward_export_list_cg_cpe(void *buf, celldata_t *cell) {
  dma_init();
  cellmeta_t meta;
  pe_get(&cell->basis, &meta, sizeof(cellmeta_t));
  dma_syn();
  char lbuf[MAX_CELL_COMM];
  char *lptr = lbuf;

  *(int *)lptr = meta.nexport;
  lptr += 4;

  /* Per-atom arrays: the exported atoms occupy the last nexport slots. */
  lptr = (char *)UP_ALIGN(lptr, 8);
  pe_get(cell->x + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->x)) * meta.nexport);
  lptr += sizeof((*cell->x)) * meta.nexport;
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 8);
  pe_get(cell->q + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->q)) * meta.nexport);
  lptr += sizeof((*cell->q)) * meta.nexport;
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 8);
  pe_get(cell->tag + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->tag)) * meta.nexport);
  lptr += sizeof((*cell->tag)) * meta.nexport;
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 4);
  pe_get(cell->t + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->t)) * meta.nexport);
  lptr += sizeof((*cell->t)) * meta.nexport;
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 8);
  pe_get(cell->v + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->v)) * meta.nexport);
  lptr += sizeof((*cell->v)) * meta.nexport;
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 8);
  pe_get(cell->mass + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->mass)) * meta.nexport);
  lptr += sizeof((*cell->mass)) * meta.nexport;
  dma_syn();

  /* Scalar counts come straight from the local metadata copy. */
  lptr = (char *)UP_ALIGN(lptr, 4);
  *(int *)lptr = meta.nbonded_export;
  lptr += 4;
  lptr = (char *)UP_ALIGN(lptr, 4);
  *(int *)lptr = meta.nchain2_export;
  lptr += 4;
  lptr = (char *)UP_ALIGN(lptr, 4);
  *(int *)lptr = meta.nscal_export;
  lptr += 4;
  lptr = (char *)UP_ALIGN(lptr, 4);
  *(int *)lptr = meta.nexcl_export;
  lptr += 4;
  lptr = (char *)UP_ALIGN(lptr, 4);
  *(int *)lptr = meta.nimpr_export;
  lptr += 4;

  /* Offset tables carry one extra terminating entry (nexport + 1). */
  lptr = (char *)UP_ALIGN(lptr, 4);
  pe_get(cell->first_bonded + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->first_bonded)) * (meta.nexport + 1));
  lptr += sizeof((*cell->first_bonded)) * (meta.nexport + 1);
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 4);
  pe_get(cell->first_chain2 + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->first_chain2)) * (meta.nexport + 1));
  lptr += sizeof((*cell->first_chain2)) * (meta.nexport + 1);
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 4);
  pe_get(cell->first_scal_atom + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->first_scal_atom)) * (meta.nexport + 1));
  lptr += sizeof((*cell->first_scal_atom)) * (meta.nexport + 1);
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 4);
  pe_get(cell->first_excl_atom + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->first_excl_atom)) * (meta.nexport + 1));
  lptr += sizeof((*cell->first_excl_atom)) * (meta.nexport + 1);
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 4);
  pe_get(cell->first_impr + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->first_impr)) * (meta.nexport + 1));
  lptr += sizeof((*cell->first_impr)) * (meta.nexport + 1);
  dma_syn();

  /* Per-term arrays: exported entries occupy the tail of each array. */
  lptr = (char *)UP_ALIGN(lptr, 8);
  pe_get(cell->bonded_tag + (MAX_BONDED_CELL - meta.nbonded_export), lptr,
         sizeof((*cell->bonded_tag)) * meta.nbonded_export);
  lptr += sizeof((*cell->bonded_tag)) * meta.nbonded_export;
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 8);
  pe_get(cell->chain2_tag + (MAX_CHAIN2_CELL - meta.nchain2_export), lptr,
         sizeof((*cell->chain2_tag)) * meta.nchain2_export);
  lptr += sizeof((*cell->chain2_tag)) * meta.nchain2_export;
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 8);
  pe_get(cell->excl_tag + (MAX_EXCL_CELL - meta.nexcl_export), lptr,
         sizeof((*cell->excl_tag)) * meta.nexcl_export);
  lptr += sizeof((*cell->excl_tag)) * meta.nexcl_export;
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 8);
  pe_get(cell->scal_tag + (MAX_SCAL_CELL - meta.nscal_export), lptr,
         sizeof((*cell->scal_tag)) * meta.nscal_export);
  lptr += sizeof((*cell->scal_tag)) * meta.nscal_export;
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 4);
  pe_get(cell->impr_idx + (MAX_IMPR_CELL - meta.nimpr_export), lptr,
         sizeof((*cell->impr_idx)) * meta.nimpr_export);
  lptr += sizeof((*cell->impr_idx)) * meta.nimpr_export;
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 8);
  pe_get(cell->shake + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->shake)) * meta.nexport);
  lptr += sizeof((*cell->shake)) * meta.nexport;
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 8);
  pe_get(cell->shake_tmp + (CELL_CAP - meta.nexport), lptr,
         sizeof((*cell->shake_tmp)) * meta.nexport);
  lptr += sizeof((*cell->shake_tmp)) * meta.nexport;
  dma_syn();

  /* Flush exactly the staged bytes; the length is measured against lbuf,
   * the buffer lptr walks, not against the destination buf. */
  lptr = (char *)UP_ALIGN(lptr, 32);
  pe_put(buf, lbuf, lptr - lbuf);
  dma_syn();
  return (size_t)(lptr - lbuf);
}
/*
 * Serialises one cell's export list ("cg" variant, which also carries
 * shake_tmp) into buf.
 *
 * Stream layout: nexport, then x, q, tag, t, v, mass (nexport entries each,
 * taken from the tail of each array), then nbonded_export, nchain2_export,
 * nscal_export, nexcl_export, nimpr_export, then the five first_* tables
 * (nexport + 1 entries each), then bonded_tag, chain2_tag, excl_tag,
 * scal_tag, impr_idx (sized by their own *_export count), then shake and
 * shake_tmp. Every count occupies an 8-byte slot holding a long.
 *
 * Counts are written with memcpy rather than through a casted long pointer:
 * the preceding fields need not total a multiple of 8 bytes, so the slot may
 * be misaligned for a direct long store.
 *
 * buf  - destination; must be large enough for the whole record.
 * cell - source cell.
 * Returns the number of bytes written.
 */
size_t pack_cell_forward_export_list_cg(void *buf, celldata_t *cell) {
  const size_t slot = 8; /* bytes reserved per count in the stream */
  char *ptr = (char *)buf;
  long count;

  count = cell->nexport;
  memcpy(ptr, &count, sizeof count);
  ptr += slot;

  pack_field(ptr, cell->x + (CELL_CAP - cell->nexport), cell->nexport);
  pack_field(ptr, cell->q + (CELL_CAP - cell->nexport), cell->nexport);
  pack_field(ptr, cell->tag + (CELL_CAP - cell->nexport), cell->nexport);
  pack_field(ptr, cell->t + (CELL_CAP - cell->nexport), cell->nexport);
  pack_field(ptr, cell->v + (CELL_CAP - cell->nexport), cell->nexport);
  pack_field(ptr, cell->mass + (CELL_CAP - cell->nexport), cell->nexport);

  count = cell->nbonded_export;
  memcpy(ptr, &count, sizeof count);
  ptr += slot;
  count = cell->nchain2_export;
  memcpy(ptr, &count, sizeof count);
  ptr += slot;
  count = cell->nscal_export;
  memcpy(ptr, &count, sizeof count);
  ptr += slot;
  count = cell->nexcl_export;
  memcpy(ptr, &count, sizeof count);
  ptr += slot;
  count = cell->nimpr_export;
  memcpy(ptr, &count, sizeof count);
  ptr += slot;

  pack_field(ptr, cell->first_bonded + (CELL_CAP - cell->nexport),
             cell->nexport + 1);
  pack_field(ptr, cell->first_chain2 + (CELL_CAP - cell->nexport),
             cell->nexport + 1);
  pack_field(ptr, cell->first_scal_atom + (CELL_CAP - cell->nexport),
             cell->nexport + 1);
  pack_field(ptr, cell->first_excl_atom + (CELL_CAP - cell->nexport),
             cell->nexport + 1);
  pack_field(ptr, cell->first_impr + (CELL_CAP - cell->nexport),
             cell->nexport + 1);

  pack_field(ptr, cell->bonded_tag + (MAX_BONDED_CELL - cell->nbonded_export),
             cell->nbonded_export);
  pack_field(ptr, cell->chain2_tag + (MAX_CHAIN2_CELL - cell->nchain2_export),
             cell->nchain2_export);
  pack_field(ptr, cell->excl_tag + (MAX_EXCL_CELL - cell->nexcl_export),
             cell->nexcl_export);
  pack_field(ptr, cell->scal_tag + (MAX_SCAL_CELL - cell->nscal_export),
             cell->nscal_export);
  pack_field(ptr, cell->impr_idx + (MAX_IMPR_CELL - cell->nimpr_export),
             cell->nimpr_export);
  pack_field(ptr, cell->shake + (CELL_CAP - cell->nexport), cell->nexport);
  pack_field(ptr, cell->shake_tmp + (CELL_CAP - cell->nexport), cell->nexport);

  return (size_t)(ptr - (char *)buf);
}
//['x', 'q', 'tag', 't', 'v', 'mass', 'nbonded_export', 'nchain2_export',
//'nscal_export', 'nexcl_export', 'nimpr_export', 'first_bonded',
//'first_chain2', 'first_scal_atom', 'first_excl_atom', 'first_impr',
//'bonded_tag', 'chain2_tag', 'excl_tag', 'scal_tag', 'impr_idx', 'shake',
//'shake_tmp']
/*
 * Deserialises one cell's export list ("cg" variant, which also carries
 * shake_tmp) from buf; the inverse of pack_cell_forward_export_list_cg.
 *
 * Each count is read before the arrays that depend on it, because the count
 * determines both how many entries follow and where in the cell's arrays
 * (tail-aligned: CAP - count) they are stored. Every count occupies an
 * 8-byte slot holding a long.
 *
 * Counts are read with memcpy rather than through a casted long pointer:
 * the preceding fields need not total a multiple of 8 bytes, so the slot may
 * be misaligned for a direct long load.
 *
 * buf  - source record.
 * cell - destination cell; its *_export counts and array tails are
 *        overwritten.
 * Returns the number of bytes consumed.
 */
size_t unpack_cell_forward_export_list_cg(void *buf, celldata_t *cell) {
  const size_t slot = 8; /* bytes reserved per count in the stream */
  char *ptr = (char *)buf;
  long count;

  memcpy(&count, ptr, sizeof count);
  cell->nexport = count;
  ptr += slot;

  unpack_field(ptr, cell->x + (CELL_CAP - cell->nexport), cell->nexport);
  unpack_field(ptr, cell->q + (CELL_CAP - cell->nexport), cell->nexport);
  unpack_field(ptr, cell->tag + (CELL_CAP - cell->nexport), cell->nexport);
  unpack_field(ptr, cell->t + (CELL_CAP - cell->nexport), cell->nexport);
  unpack_field(ptr, cell->v + (CELL_CAP - cell->nexport), cell->nexport);
  unpack_field(ptr, cell->mass + (CELL_CAP - cell->nexport), cell->nexport);

  memcpy(&count, ptr, sizeof count);
  cell->nbonded_export = count;
  ptr += slot;
  memcpy(&count, ptr, sizeof count);
  cell->nchain2_export = count;
  ptr += slot;
  memcpy(&count, ptr, sizeof count);
  cell->nscal_export = count;
  ptr += slot;
  memcpy(&count, ptr, sizeof count);
  cell->nexcl_export = count;
  ptr += slot;
  memcpy(&count, ptr, sizeof count);
  cell->nimpr_export = count;
  ptr += slot;

  unpack_field(ptr, cell->first_bonded + (CELL_CAP - cell->nexport),
               cell->nexport + 1);
  unpack_field(ptr, cell->first_chain2 + (CELL_CAP - cell->nexport),
               cell->nexport + 1);
  unpack_field(ptr, cell->first_scal_atom + (CELL_CAP - cell->nexport),
               cell->nexport + 1);
  unpack_field(ptr, cell->first_excl_atom + (CELL_CAP - cell->nexport),
               cell->nexport + 1);
  unpack_field(ptr, cell->first_impr + (CELL_CAP - cell->nexport),
               cell->nexport + 1);

  unpack_field(ptr, cell->bonded_tag + (MAX_BONDED_CELL - cell->nbonded_export),
               cell->nbonded_export);
  unpack_field(ptr, cell->chain2_tag + (MAX_CHAIN2_CELL - cell->nchain2_export),
               cell->nchain2_export);
  unpack_field(ptr, cell->excl_tag + (MAX_EXCL_CELL - cell->nexcl_export),
               cell->nexcl_export);
  unpack_field(ptr, cell->scal_tag + (MAX_SCAL_CELL - cell->nscal_export),
               cell->nscal_export);
  unpack_field(ptr, cell->impr_idx + (MAX_IMPR_CELL - cell->nimpr_export),
               cell->nimpr_export);
  unpack_field(ptr, cell->shake + (CELL_CAP - cell->nexport), cell->nexport);
  unpack_field(ptr, cell->shake_tmp + (CELL_CAP - cell->nexport),
               cell->nexport);

  return (size_t)(ptr - (char *)buf);
}

/*
 * Packs the "cg" export lists of every cell in the brick
 * [xlo,xhi) x [ylo,yhi) x [zlo,zhi) back to back into buf, visiting cells
 * with x outermost and z innermost.
 *
 * Returns the total number of bytes written.
 */
size_t pack_brick_forward_export_list_cg(void *buf, cellgrid_t *grid, int xlo,
                                         int xhi, int ylo, int yhi, int zlo,
                                         int zhi) {
  size_t used = 0;
  int ix, iy, iz;

  for (ix = xlo; ix < xhi; ++ix)
    for (iy = ylo; iy < yhi; ++iy)
      for (iz = zlo; iz < zhi; ++iz) {
        celldata_t *src = get_cell_xyz(grid, ix, iy, iz);
        used += pack_cell_forward_export_list_cg((char *)buf + used, src);
      }

  return used;
}

/*
 * Unpacks consecutive "cg" export-list records from buf into every cell of
 * the brick [xlo,xhi) x [ylo,yhi) x [zlo,zhi), visiting cells with x
 * outermost and z innermost.
 *
 * Returns the total number of bytes consumed.
 */
size_t unpack_brick_forward_export_list_cg(void *buf, cellgrid_t *grid, int xlo,
                                           int xhi, int ylo, int yhi, int zlo,
                                           int zhi) {
  size_t used = 0;
  int ix, iy, iz;

  for (ix = xlo; ix < xhi; ++ix)
    for (iy = ylo; iy < yhi; ++iy)
      for (iz = zlo; iz < zhi; ++iz) {
        celldata_t *dst = get_cell_xyz(grid, ix, iy, iz);
        used += unpack_cell_forward_export_list_cg((char *)buf + used, dst);
      }

  return used;
}

/*
 * Argument bundle handed to the *_brick_*_cpe routines below.
 *
 *   buf   - start of the packed byte stream the routine reads or writes.
 *   grid  - cell grid descriptor; the routines copy it locally before use.
 *   x/y/z lo,hi - half-open cell index ranges [lo, hi) of the brick.
 *
 * NOTE(review): this exact struct/typedef is defined more than once in this
 * file; confirm the build tolerates the repeated definition or hoist it into
 * a single guarded declaration.
 */
typedef struct pack_brick_param {
  void *buf;
  cellgrid_t *grid;
  int xlo, xhi, ylo, yhi, zlo, zhi;
} pack_brick_param_t;
#ifdef __sw_host__
/* Host-side declarations of the slave_-prefixed entry points; presumably the
 * launcher symbols for the *_cpe routines of the same name - TODO confirm. */
extern void slave_pack_brick_forward_export_list_cg_cpe(void *);
extern void slave_unpack_brick_forward_export_list_cg_cpe(void *);
#endif
#ifdef __sw_slave__
#include "dma_macros_new.h"
#include <qthread_slave.h>
/*
 * Worker-side packer for the "cg" export-list exchange over one brick of
 * cells.
 *
 * pm points at a pack_brick_param_t describing the destination buffer, the
 * grid and the half-open index ranges [xlo,xhi) x [ylo,yhi) x [zlo,zhi).
 * The parameter block and grid descriptor are copied locally with
 * pe_get/dma_syn; cells are numbered in x-major, z-fastest order and only
 * those whose number has low six bits equal to _MYID are packed here.
 *
 * Returns the number of bytes this worker appended after lpm.buf.
 *
 * NOTE(review): every worker starts at lpm.buf and advances only by its own
 * cells, so concurrent workers would write overlapping regions - confirm how
 * per-worker offsets are meant to be derived. Sibling pack_cell_*_cpe
 * routines also call dma_init() before their first pe_get; confirm whether
 * that is required here.
 */
size_t pack_brick_forward_export_list_cg_cpe(pack_brick_param_t *pm) {
  pack_brick_param_t lpm;
  cellgrid_t lgrid;
  /* The grid pointer lives inside the parameter block, so the block has to
   * arrive before the grid transfer can be issued. */
  pe_get(pm, &lpm, sizeof(pack_brick_param_t));
  dma_syn();
  pe_get(lpm.grid, &lgrid, sizeof(cellgrid_t));
  dma_syn();

  char *ptr = (char *)lpm.buf;
  const int ylen = lpm.yhi - lpm.ylo;
  const int zlen = lpm.zhi - lpm.zlo;

  for (int i = lpm.xlo; i < lpm.xhi; i++) {
    for (int j = lpm.ylo; j < lpm.yhi; j++) {
      for (int k = lpm.zlo; k < lpm.zhi; k++) {
        int cellid = ((i - lpm.xlo) * ylen + (j - lpm.ylo)) * zlen + (k - lpm.zlo);
        /* Parenthesised on purpose: `!=` binds tighter than `&`. */
        if ((cellid & 63) != _MYID)
          continue;
        celldata_t *cell = get_cell_xyz(&lgrid, i, j, k);
        ptr += pack_cell_forward_export_list_cg(ptr, cell);
      }
    }
  }
  return (size_t)(ptr - (char *)lpm.buf);
}
/*
 * Worker-side unpacker for the "cg" export-list exchange over one brick of
 * cells.
 *
 * pm points at a pack_brick_param_t describing the source buffer, the grid
 * and the half-open index ranges [xlo,xhi) x [ylo,yhi) x [zlo,zhi). The
 * parameter block and grid descriptor are copied locally with
 * pe_get/dma_syn; cells are numbered in x-major, z-fastest order and only
 * those whose number has low six bits equal to _MYID are handled here.
 *
 * Returns the number of bytes this worker consumed starting at lpm.buf.
 *
 * NOTE(review): every worker starts reading at lpm.buf and advances only by
 * its own cells - confirm how per-worker offsets are meant to be derived.
 * Sibling pack_cell_*_cpe routines also call dma_init() before their first
 * pe_get; confirm whether that is required here.
 */
size_t unpack_brick_forward_export_list_cg_cpe(pack_brick_param_t *pm) {
  pack_brick_param_t lpm;
  cellgrid_t lgrid;
  /* The grid pointer lives inside the parameter block, so the block has to
   * arrive before the grid transfer can be issued. */
  pe_get(pm, &lpm, sizeof(pack_brick_param_t));
  dma_syn();
  pe_get(lpm.grid, &lgrid, sizeof(cellgrid_t));
  dma_syn();

  char *ptr = (char *)lpm.buf;
  const int ylen = lpm.yhi - lpm.ylo;
  const int zlen = lpm.zhi - lpm.zlo;

  for (int i = lpm.xlo; i < lpm.xhi; i++) {
    for (int j = lpm.ylo; j < lpm.yhi; j++) {
      for (int k = lpm.zlo; k < lpm.zhi; k++) {
        int cellid = ((i - lpm.xlo) * ylen + (j - lpm.ylo)) * zlen + (k - lpm.zlo);
        /* Parenthesised on purpose: `!=` binds tighter than `&`. */
        if ((cellid & 63) != _MYID)
          continue;
        celldata_t *cell = get_cell_xyz(&lgrid, i, j, k);
        ptr += unpack_cell_forward_export_list_cg(ptr, cell);
      }
    }
  }
  return (size_t)(ptr - (char *)lpm.buf);
}

#endif

/*
 * Exchanges "cg" cell export lists with the prev/next neighbours along z,
 * then y, then x.
 *
 * Each axis follows the same sequence: post both non-blocking receives,
 * pack the bricks [0, nn) (for prev) and [nlocal-nn, nlocal) (for next) into
 * the send buffers, start both sends, wait for all four requests, then
 * unpack recv_prev into [lo, lo+nn) and recv_next into [hi-nn, hi) on that
 * axis. Axes not yet exchanged span their full 0..nlocal range; axes that
 * have already been exchanged are restricted to lo..hi.
 *
 * grid - cell grid; dim.lo/dim.hi, nlocal and nn are read from it.
 * mpp  - communication state (buffers, requests, neighbour ranks) used by
 *        the COMM_* macros.
 */
void forward_comm_export_list_cg(cellgrid_t *grid, mpp_t *mpp) {
  vec<int> *lo = &(grid->dim.lo);
  vec<int> *hi = &(grid->dim.hi);
  vec<int> *nlocal = &(grid->nlocal);
  int nn = grid->nn;

  size_t nsend_prev, nsend_next;

  // ---- z axis: x and y span their full 0..nlocal ranges ----
  COMM_IRECV(mpp, prev, z);
  COMM_IRECV(mpp, next, z);
  nsend_prev = pack_brick_forward_export_list_cg(
      mpp->send_prev, grid, 0, nlocal->x, 0, nlocal->y, 0, nn);
  nsend_next =
      pack_brick_forward_export_list_cg(mpp->send_next, grid, 0, nlocal->x, 0,
                                        nlocal->y, nlocal->z - nn, nlocal->z);
  COMM_ISEND(mpp, nsend_prev, prev, z);
  COMM_ISEND(mpp, nsend_next, next, z);
  // Send buffers are reused by the next phase, so all four requests must
  // complete before continuing.
  COMM_WAITALL(mpp);
  unpack_brick_forward_export_list_cg(mpp->recv_prev, grid, 0, nlocal->x, 0,
                                      nlocal->y, lo->z, lo->z + nn);
  unpack_brick_forward_export_list_cg(mpp->recv_next, grid, 0, nlocal->x, 0,
                                      nlocal->y, hi->z - nn, hi->z);

  // ---- y axis: x still full range, z restricted to lo..hi ----
  COMM_IRECV(mpp, prev, y);
  COMM_IRECV(mpp, next, y);
  nsend_prev = pack_brick_forward_export_list_cg(
      mpp->send_prev, grid, 0, nlocal->x, 0, nn, lo->z, hi->z);
  nsend_next = pack_brick_forward_export_list_cg(mpp->send_next, grid, 0,
                                                 nlocal->x, nlocal->y - nn,
                                                 nlocal->y, lo->z, hi->z);
  COMM_ISEND(mpp, nsend_prev, prev, y);
  COMM_ISEND(mpp, nsend_next, next, y);
  COMM_WAITALL(mpp);
  unpack_brick_forward_export_list_cg(mpp->recv_prev, grid, 0, nlocal->x, lo->y,
                                      lo->y + nn, lo->z, hi->z);
  unpack_brick_forward_export_list_cg(mpp->recv_next, grid, 0, nlocal->x,
                                      hi->y - nn, hi->y, lo->z, hi->z);

  // ---- x axis: y and z both restricted to lo..hi ----
  COMM_IRECV(mpp, prev, x);
  COMM_IRECV(mpp, next, x);
  nsend_prev = pack_brick_forward_export_list_cg(mpp->send_prev, grid, 0, nn,
                                                 lo->y, hi->y, lo->z, hi->z);
  nsend_next =
      pack_brick_forward_export_list_cg(mpp->send_next, grid, nlocal->x - nn,
                                        nlocal->x, lo->y, hi->y, lo->z, hi->z);
  COMM_ISEND(mpp, nsend_prev, prev, x);
  COMM_ISEND(mpp, nsend_next, next, x);
  COMM_WAITALL(mpp);
  unpack_brick_forward_export_list_cg(mpp->recv_prev, grid, lo->x, lo->x + nn,
                                      lo->y, hi->y, lo->z, hi->z);
  unpack_brick_forward_export_list_cg(mpp->recv_next, grid, hi->x - nn, hi->x,
                                      lo->y, hi->y, lo->z, hi->z);
}

/*
 * Worker-side packer for one cell's shake_tmp array.
 *
 * The cell's metadata (cell->basis) is copied into a local cellmeta_t, the
 * first meta.natom entries of shake_tmp are staged into the local buffer
 * lbuf with pe_get, and the staged bytes (rounded up with UP_ALIGN to 32)
 * are written to buf with a single pe_put.
 *
 * buf  - destination of the final pe_put.
 * cell - source cell.
 * Returns the number of bytes staged (and transferred).
 *
 * NOTE(review): UP_ALIGN rounds the absolute address, so any leading padding
 * depends on where lbuf is placed; confirm lbuf is suitably aligned.
 */
size_t pack_cell_forward_shake_cpe(void *buf, celldata_t *cell) {
  dma_init();
  cellmeta_t meta;
  pe_get(&cell->basis, &meta, sizeof(cellmeta_t));
  dma_syn();
  char lbuf[MAX_CELL_COMM];
  char *lptr = lbuf;

  lptr = (char *)UP_ALIGN(lptr, 8);
  pe_get(cell->shake_tmp, lptr, sizeof((*cell->shake_tmp)) * meta.natom);
  lptr += sizeof((*cell->shake_tmp)) * meta.natom;
  dma_syn();

  /* Flush exactly the staged bytes; the length is measured against lbuf,
   * the buffer lptr walks, not against the destination buf. */
  lptr = (char *)UP_ALIGN(lptr, 32);
  pe_put(buf, lbuf, lptr - lbuf);
  dma_syn();
  return (size_t)(lptr - lbuf);
}
/*
 * Copies the first cell->natom entries of cell->shake_tmp into buf.
 * Returns the number of bytes written.
 */
size_t pack_cell_forward_shake(void *buf, celldata_t *cell) {
  /* Single field, so the byte count reported by memtran is the record size. */
  return memtran(buf, cell->shake_tmp,
                 sizeof(*(cell->shake_tmp)) * (cell->natom));
}
//['shake_tmp']
/*
 * Copies cell->natom entries from buf into the front of cell->shake_tmp.
 * Returns the number of bytes consumed.
 */
size_t unpack_cell_forward_shake(void *buf, celldata_t *cell) {
  /* Single field, so the byte count reported by memtran is the record size. */
  return memtran(cell->shake_tmp, buf,
                 sizeof(*(cell->shake_tmp)) * (cell->natom));
}

/*
 * Packs the shake_tmp data of every cell in the brick
 * [xlo,xhi) x [ylo,yhi) x [zlo,zhi) back to back into buf, visiting cells
 * with x outermost and z innermost.
 *
 * Returns the total number of bytes written.
 */
size_t pack_brick_forward_shake(void *buf, cellgrid_t *grid, int xlo, int xhi,
                                int ylo, int yhi, int zlo, int zhi) {
  size_t used = 0;
  int ix, iy, iz;

  for (ix = xlo; ix < xhi; ++ix)
    for (iy = ylo; iy < yhi; ++iy)
      for (iz = zlo; iz < zhi; ++iz) {
        celldata_t *src = get_cell_xyz(grid, ix, iy, iz);
        used += pack_cell_forward_shake((char *)buf + used, src);
      }

  return used;
}

/*
 * Unpacks consecutive shake_tmp records from buf into every cell of the
 * brick [xlo,xhi) x [ylo,yhi) x [zlo,zhi), visiting cells with x outermost
 * and z innermost.
 *
 * Returns the total number of bytes consumed.
 */
size_t unpack_brick_forward_shake(void *buf, cellgrid_t *grid, int xlo, int xhi,
                                  int ylo, int yhi, int zlo, int zhi) {
  size_t used = 0;
  int ix, iy, iz;

  for (ix = xlo; ix < xhi; ++ix)
    for (iy = ylo; iy < yhi; ++iy)
      for (iz = zlo; iz < zhi; ++iz) {
        celldata_t *dst = get_cell_xyz(grid, ix, iy, iz);
        used += unpack_cell_forward_shake((char *)buf + used, dst);
      }

  return used;
}

typedef struct pack_brick_param {
  void *buf;
  cellgrid_t *grid;
  int xlo, xhi, ylo, yhi, zlo, zhi;
} pack_brick_param_t;
// Host-only declarations of the CPE entry points for the forward shake brick
// pack/unpack routines.
// NOTE(review): presumably the slave_ prefix is the host-visible symbol name
// of the matching *_cpe function and the void * argument is a
// pack_brick_param_t * - confirm against the launch site.
#ifdef __sw_host__
extern void slave_pack_brick_forward_shake_cpe(void *);
extern void slave_unpack_brick_forward_shake_cpe(void *);
#endif
#ifdef __sw_slave__
#include "dma_macros_new.h"
#include <qthread_slave.h>
// CPE-side brick packer for shake_tmp.
//
// Fetches the parameter block and the grid descriptor with pe_get, then walks
// the half-open brick [xlo, xhi) x [ylo, yhi) x [zlo, zhi) described by the
// parameters. Cells are numbered in loop order (z fastest) and a cell is
// handled only when the low six bits of its number equal _MYID.
//
//   pm  parameter block; it is copied locally before use.
//
// Returns the number of bytes this invocation appended after lpm.buf.
//
// NOTE(review): every invocation starts writing at lpm.buf and advances only
// for the cells it handles, so outputs of different _MYID values would land
// on the same bytes unless per-CPE offsets are arranged elsewhere - confirm.
// NOTE(review): unlike the per-cell *_cpe packers, no dma_init() is issued
// before pe_get here, and the plain (non-_cpe) cell packer is called -
// confirm both are intended.
size_t pack_brick_forward_shake_cpe(pack_brick_param_t *pm) {
  pack_brick_param_t lpm;
  cellgrid_t lgrid;
  pe_get(pm, &lpm, sizeof(pack_brick_param_t));
  dma_syn();
  pe_get(lpm.grid, &lgrid, sizeof(cellgrid_t));
  char *out = (char *)lpm.buf;
  // The brick bounds live in the fetched parameter block; the original loops
  // referenced undeclared xlo/xhi/... names.
  const int xlo = lpm.xlo, xhi = lpm.xhi;
  const int ylo = lpm.ylo, yhi = lpm.yhi;
  const int zlo = lpm.zlo, zhi = lpm.zhi;
  const int ylen = yhi - ylo;
  const int zlen = zhi - zlo;
  dma_syn();
  for (int i = xlo; i < xhi; i++) {
    for (int j = ylo; j < yhi; j++) {
      for (int k = zlo; k < zhi; k++) {
        int cellid = ((i - xlo) * ylen + (j - ylo)) * zlen + k - zlo;
        // != binds tighter than &, so the mask needs its own parentheses.
        if ((cellid & 63) != _MYID)
          continue;
        celldata_t *cell = get_cell_xyz(&lgrid, i, j, k);
        out += pack_cell_forward_shake(out, cell);
      }
    }
  }
  return out - (char *)lpm.buf;
}
// CPE-side brick unpacker for shake_tmp.
//
// Fetches the parameter block and the grid descriptor with pe_get, then walks
// the half-open brick [xlo, xhi) x [ylo, yhi) x [zlo, zhi) described by the
// parameters. Cells are numbered in loop order (z fastest) and a cell is
// handled only when the low six bits of its number equal _MYID.
//
//   pm  parameter block; it is copied locally before use.
//
// Returns the number of bytes this invocation consumed after lpm.buf.
//
// NOTE(review): every invocation starts reading at lpm.buf and advances only
// for the cells it handles, so the read offset does not account for cells
// skipped in between - confirm how per-CPE offsets are meant to be derived.
// NOTE(review): unlike the per-cell *_cpe packers, no dma_init() is issued
// before pe_get here - confirm that is intended.
size_t unpack_brick_forward_shake_cpe(pack_brick_param_t *pm) {
  pack_brick_param_t lpm;
  cellgrid_t lgrid;
  pe_get(pm, &lpm, sizeof(pack_brick_param_t));
  dma_syn();
  pe_get(lpm.grid, &lgrid, sizeof(cellgrid_t));
  char *in = (char *)lpm.buf;
  // The brick bounds live in the fetched parameter block; the original loops
  // referenced undeclared xlo/xhi/... names.
  const int xlo = lpm.xlo, xhi = lpm.xhi;
  const int ylo = lpm.ylo, yhi = lpm.yhi;
  const int zlo = lpm.zlo, zhi = lpm.zhi;
  const int ylen = yhi - ylo;
  const int zlen = zhi - zlo;
  dma_syn();
  for (int i = xlo; i < xhi; i++) {
    for (int j = ylo; j < yhi; j++) {
      for (int k = zlo; k < zhi; k++) {
        int cellid = ((i - xlo) * ylen + (j - ylo)) * zlen + k - zlo;
        // != binds tighter than &, so the mask needs its own parentheses.
        if ((cellid & 63) != _MYID)
          continue;
        celldata_t *cell = get_cell_xyz(&lgrid, i, j, k);
        in += unpack_cell_forward_shake(in, cell);
      }
    }
  }
  return in - (char *)lpm.buf;
}

#endif

// Forward exchange of the shake_tmp field with the prev/next neighbours,
// performed one axis at a time in the order z, y, x.
//
// Each phase posts both receives, packs one brick per direction into
// mpp->send_prev / mpp->send_next, posts both sends, waits for all four
// requests, and then unpacks mpp->recv_prev / mpp->recv_next into the grid
// (unpack_brick_forward_shake overwrites shake_tmp of the target cells).
//
//   grid  cell grid; its dim.lo, dim.hi, nlocal and nn fields select bricks.
//   mpp   communication state (buffers, neighbours, requests, communicator).
void forward_comm_shake(cellgrid_t *grid, mpp_t *mpp) {
  vec<int> *lo = &(grid->dim.lo);
  vec<int> *hi = &(grid->dim.hi);
  vec<int> *nlocal = &(grid->nlocal);
  int nn = grid->nn;

  size_t nsend_prev, nsend_next;

  // z phase: send the z slabs [0, nn) and [nlocal->z - nn, nlocal->z) over
  // the local x/y extent; receive into [lo->z, lo->z + nn) and
  // [hi->z - nn, hi->z).
  COMM_IRECV(mpp, prev, z);
  COMM_IRECV(mpp, next, z);
  nsend_prev = pack_brick_forward_shake(mpp->send_prev, grid, 0, nlocal->x, 0,
                                        nlocal->y, 0, nn);
  nsend_next = pack_brick_forward_shake(mpp->send_next, grid, 0, nlocal->x, 0,
                                        nlocal->y, nlocal->z - nn, nlocal->z);
  COMM_ISEND(mpp, nsend_prev, prev, z);
  COMM_ISEND(mpp, nsend_next, next, z);
  COMM_WAITALL(mpp);
  unpack_brick_forward_shake(mpp->recv_prev, grid, 0, nlocal->x, 0, nlocal->y,
                             lo->z, lo->z + nn);
  unpack_brick_forward_shake(mpp->recv_next, grid, 0, nlocal->x, 0, nlocal->y,
                             hi->z - nn, hi->z);

  // y phase: the z range is now the full [lo->z, hi->z), so the slabs
  // received in the z phase are part of what is sent along y.
  COMM_IRECV(mpp, prev, y);
  COMM_IRECV(mpp, next, y);
  nsend_prev = pack_brick_forward_shake(mpp->send_prev, grid, 0, nlocal->x, 0,
                                        nn, lo->z, hi->z);
  nsend_next =
      pack_brick_forward_shake(mpp->send_next, grid, 0, nlocal->x,
                               nlocal->y - nn, nlocal->y, lo->z, hi->z);
  COMM_ISEND(mpp, nsend_prev, prev, y);
  COMM_ISEND(mpp, nsend_next, next, y);
  COMM_WAITALL(mpp);
  unpack_brick_forward_shake(mpp->recv_prev, grid, 0, nlocal->x, lo->y,
                             lo->y + nn, lo->z, hi->z);
  unpack_brick_forward_shake(mpp->recv_next, grid, 0, nlocal->x, hi->y - nn,
                             hi->y, lo->z, hi->z);

  // x phase: both y and z now span their full [lo, hi) ranges.
  COMM_IRECV(mpp, prev, x);
  COMM_IRECV(mpp, next, x);
  nsend_prev = pack_brick_forward_shake(mpp->send_prev, grid, 0, nn, lo->y,
                                        hi->y, lo->z, hi->z);
  nsend_next = pack_brick_forward_shake(mpp->send_next, grid, nlocal->x - nn,
                                        nlocal->x, lo->y, hi->y, lo->z, hi->z);
  COMM_ISEND(mpp, nsend_prev, prev, x);
  COMM_ISEND(mpp, nsend_next, next, x);
  COMM_WAITALL(mpp);
  unpack_brick_forward_shake(mpp->recv_prev, grid, lo->x, lo->x + nn, lo->y,
                             hi->y, lo->z, hi->z);
  unpack_brick_forward_shake(mpp->recv_next, grid, hi->x - nn, hi->x, lo->y,
                             hi->y, lo->z, hi->z);
}

// CPE-side packer for the shake_tmp array of a single cell (reverse
// exchange).
//
// The cell metadata is fetched with pe_get so that natom is available
// locally, the shake_tmp entries are staged in a local byte buffer, and the
// staged bytes are then written to buf with pe_put.
//
//   buf   destination handed to pe_put.
//   cell  cell whose shake_tmp entries (meta.natom of them) are packed.
//
// Returns the distance in bytes from the start of the staging buffer to the
// 32-byte-aligned end of the staged data; the same count is passed to pe_put.
//
// NOTE(review): nothing here checks that natom entries fit in MAX_CELL_COMM
// bytes, and lbuf carries no alignment attribute, so the UP_ALIGN(…, 8)
// below may leave leading pad bytes that are also transferred - confirm.
size_t pack_cell_reverse_shake_cpe(void *buf, celldata_t *cell) {
  dma_init();
  cellmeta_t meta;
  pe_get(&cell->basis, &meta, sizeof(cellmeta_t));
  dma_syn();
  // An array of void is not valid C; stage the data in a byte array instead.
  char lbuf[MAX_CELL_COMM];
  char *lptr = (char *)UP_ALIGN(lbuf, 8);
  pe_get(cell->shake_tmp, lptr, sizeof((*cell->shake_tmp)) * meta.natom);
  lptr += sizeof((*cell->shake_tmp)) * meta.natom;
  dma_syn();
  lptr = (char *)UP_ALIGN(lptr, 32);
  // The transfer length is measured from the staging buffer, not from the
  // destination pointer (the original used lptr - buf here).
  size_t nbytes = (size_t)(lptr - lbuf);
  pe_put(buf, lbuf, nbytes);
  dma_syn();
  return nbytes;
}
// Copies cell->natom entries of cell->shake_tmp into buf for the reverse
// exchange. Returns the number of bytes written.
size_t pack_cell_reverse_shake(void *buf, celldata_t *cell) {
  char *cursor = (char *)buf;
  pack_field(cursor, cell->shake_tmp, cell->natom);
  return cursor - (char *)buf;
}
//['shake_tmp']
// Combines cell->natom received entries from buf into cell->shake_tmp using
// vecaddv (existing entry and received entry as inputs, existing entry as
// destination) instead of overwriting them.
// Returns the number of bytes consumed from buf.
size_t unpack_cell_reverse_shake(void *buf, celldata_t *cell) {
  vec<real> *incoming = buf;
  for (int atom = 0; atom < cell->natom; atom++) {
    vecaddv(cell->shake_tmp[atom], cell->shake_tmp[atom], incoming[atom]);
  }
  return sizeof(*(cell->shake_tmp)) * cell->natom;
}

// Packs shake_tmp of every cell in the half-open brick
// [xlo, xhi) x [ylo, yhi) x [zlo, zhi) into buf for the reverse exchange,
// cell after cell with z varying fastest.
// Returns the total number of bytes written.
size_t pack_brick_reverse_shake(void *buf, cellgrid_t *grid, int xlo, int xhi,
                                int ylo, int yhi, int zlo, int zhi) {
  size_t nbytes = 0;
  for (int ix = xlo; ix < xhi; ++ix) {
    for (int iy = ylo; iy < yhi; ++iy) {
      for (int iz = zlo; iz < zhi; ++iz) {
        celldata_t *cell = get_cell_xyz(grid, ix, iy, iz);
        // Each cell is appended directly after the previous one.
        nbytes += pack_cell_reverse_shake((char *)buf + nbytes, cell);
      }
    }
  }
  return nbytes;
}

// Applies received reverse-exchange data from buf to every cell in the
// half-open brick [xlo, xhi) x [ylo, yhi) x [zlo, zhi), in the same cell
// order (z fastest) used by pack_brick_reverse_shake.
// Returns the total number of bytes consumed.
size_t unpack_brick_reverse_shake(void *buf, cellgrid_t *grid, int xlo, int xhi,
                                  int ylo, int yhi, int zlo, int zhi) {
  size_t nbytes = 0;
  for (int ix = xlo; ix < xhi; ++ix) {
    for (int iy = ylo; iy < yhi; ++iy) {
      for (int iz = zlo; iz < zhi; ++iz) {
        celldata_t *cell = get_cell_xyz(grid, ix, iy, iz);
        // Each cell's data starts right after the previous cell's data.
        nbytes += unpack_cell_reverse_shake((char *)buf + nbytes, cell);
      }
    }
  }
  return nbytes;
}

// struct pack_brick_param (buf, grid, xlo/xhi, ylo/yhi, zlo/zhi) is already
// fully defined earlier in this file, ahead of the forward-shake CPE entry
// points. Defining the struct body a second time in the same translation
// unit is a redefinition error, so only the typedef name is re-declared
// here; it refers to that same struct.
typedef struct pack_brick_param pack_brick_param_t;
// Host-only declarations of the CPE entry points for the reverse shake brick
// pack/unpack routines.
// NOTE(review): presumably the slave_ prefix is the host-visible symbol name
// of the matching *_cpe function and the void * argument is a
// pack_brick_param_t * - confirm against the launch site.
#ifdef __sw_host__
extern void slave_pack_brick_reverse_shake_cpe(void *);
extern void slave_unpack_brick_reverse_shake_cpe(void *);
#endif
#ifdef __sw_slave__
#include "dma_macros_new.h"
#include <qthread_slave.h>
// CPE-side brick packer for shake_tmp (reverse exchange).
//
// Fetches the parameter block and the grid descriptor with pe_get, then walks
// the half-open brick [xlo, xhi) x [ylo, yhi) x [zlo, zhi) described by the
// parameters. Cells are numbered in loop order (z fastest) and a cell is
// handled only when the low six bits of its number equal _MYID.
//
//   pm  parameter block; it is copied locally before use.
//
// Returns the number of bytes this invocation appended after lpm.buf.
//
// NOTE(review): every invocation starts writing at lpm.buf and advances only
// for the cells it handles, so outputs of different _MYID values would land
// on the same bytes unless per-CPE offsets are arranged elsewhere - confirm.
// NOTE(review): unlike the per-cell *_cpe packers, no dma_init() is issued
// before pe_get here, and the plain (non-_cpe) cell packer is called -
// confirm both are intended.
size_t pack_brick_reverse_shake_cpe(pack_brick_param_t *pm) {
  pack_brick_param_t lpm;
  cellgrid_t lgrid;
  pe_get(pm, &lpm, sizeof(pack_brick_param_t));
  dma_syn();
  pe_get(lpm.grid, &lgrid, sizeof(cellgrid_t));
  char *out = (char *)lpm.buf;
  // The brick bounds live in the fetched parameter block; the original loops
  // referenced undeclared xlo/xhi/... names.
  const int xlo = lpm.xlo, xhi = lpm.xhi;
  const int ylo = lpm.ylo, yhi = lpm.yhi;
  const int zlo = lpm.zlo, zhi = lpm.zhi;
  const int ylen = yhi - ylo;
  const int zlen = zhi - zlo;
  dma_syn();
  for (int i = xlo; i < xhi; i++) {
    for (int j = ylo; j < yhi; j++) {
      for (int k = zlo; k < zhi; k++) {
        int cellid = ((i - xlo) * ylen + (j - ylo)) * zlen + k - zlo;
        // != binds tighter than &, so the mask needs its own parentheses.
        if ((cellid & 63) != _MYID)
          continue;
        celldata_t *cell = get_cell_xyz(&lgrid, i, j, k);
        out += pack_cell_reverse_shake(out, cell);
      }
    }
  }
  return out - (char *)lpm.buf;
}
// CPE-side brick unpacker for shake_tmp (reverse exchange).
//
// Fetches the parameter block and the grid descriptor with pe_get, then walks
// the half-open brick [xlo, xhi) x [ylo, yhi) x [zlo, zhi) described by the
// parameters. Cells are numbered in loop order (z fastest) and a cell is
// handled only when the low six bits of its number equal _MYID.
//
//   pm  parameter block; it is copied locally before use.
//
// Returns the number of bytes this invocation consumed after lpm.buf.
//
// NOTE(review): every invocation starts reading at lpm.buf and advances only
// for the cells it handles, so the read offset does not account for cells
// skipped in between - confirm how per-CPE offsets are meant to be derived.
// NOTE(review): unlike the per-cell *_cpe packers, no dma_init() is issued
// before pe_get here - confirm that is intended.
size_t unpack_brick_reverse_shake_cpe(pack_brick_param_t *pm) {
  pack_brick_param_t lpm;
  cellgrid_t lgrid;
  pe_get(pm, &lpm, sizeof(pack_brick_param_t));
  dma_syn();
  pe_get(lpm.grid, &lgrid, sizeof(cellgrid_t));
  char *in = (char *)lpm.buf;
  // The brick bounds live in the fetched parameter block; the original loops
  // referenced undeclared xlo/xhi/... names.
  const int xlo = lpm.xlo, xhi = lpm.xhi;
  const int ylo = lpm.ylo, yhi = lpm.yhi;
  const int zlo = lpm.zlo, zhi = lpm.zhi;
  const int ylen = yhi - ylo;
  const int zlen = zhi - zlo;
  dma_syn();
  for (int i = xlo; i < xhi; i++) {
    for (int j = ylo; j < yhi; j++) {
      for (int k = zlo; k < zhi; k++) {
        int cellid = ((i - xlo) * ylen + (j - ylo)) * zlen + k - zlo;
        // != binds tighter than &, so the mask needs its own parentheses.
        if ((cellid & 63) != _MYID)
          continue;
        celldata_t *cell = get_cell_xyz(&lgrid, i, j, k);
        in += unpack_cell_reverse_shake(in, cell);
      }
    }
  }
  return in - (char *)lpm.buf;
}

#endif

// Reverse exchange of the shake_tmp field with the prev/next neighbours,
// performed one axis at a time in the order x, y, z (the opposite of
// forward_comm_shake).
//
// Each phase posts both receives, packs the outer slabs [lo, lo + nn) and
// [hi - nn, hi) of the current axis into mpp->send_prev / mpp->send_next,
// posts both sends, waits for all four requests, and then applies
// mpp->recv_prev / mpp->recv_next to the slabs [0, nn) and
// [nlocal - nn, nlocal). unpack_brick_reverse_shake combines the received
// values with the existing shake_tmp via vecaddv rather than overwriting.
//
//   grid  cell grid; its dim.lo, dim.hi, nlocal and nn fields select bricks.
//   mpp   communication state (buffers, neighbours, requests, communicator).
void reverse_comm_shake(cellgrid_t *grid, mpp_t *mpp) {
  vec<int> *lo = &(grid->dim.lo);
  vec<int> *hi = &(grid->dim.hi);
  vec<int> *nlocal = &(grid->nlocal);
  int nn = grid->nn;

  size_t nsend_prev, nsend_next;

  // x phase: y and z span their full [lo, hi) ranges.
  COMM_IRECV(mpp, prev, x);
  COMM_IRECV(mpp, next, x);
  nsend_prev = pack_brick_reverse_shake(mpp->send_prev, grid, lo->x, lo->x + nn,
                                        lo->y, hi->y, lo->z, hi->z);
  nsend_next = pack_brick_reverse_shake(mpp->send_next, grid, hi->x - nn, hi->x,
                                        lo->y, hi->y, lo->z, hi->z);
  COMM_ISEND(mpp, nsend_prev, prev, x);
  COMM_ISEND(mpp, nsend_next, next, x);
  COMM_WAITALL(mpp);
  unpack_brick_reverse_shake(mpp->recv_prev, grid, 0, nn, lo->y, hi->y, lo->z,
                             hi->z);
  unpack_brick_reverse_shake(mpp->recv_next, grid, nlocal->x - nn, nlocal->x,
                             lo->y, hi->y, lo->z, hi->z);

  // y phase: x is restricted to [0, nlocal->x); z still spans [lo->z, hi->z).
  COMM_IRECV(mpp, prev, y);
  COMM_IRECV(mpp, next, y);
  nsend_prev = pack_brick_reverse_shake(mpp->send_prev, grid, 0, nlocal->x,
                                        lo->y, lo->y + nn, lo->z, hi->z);
  nsend_next = pack_brick_reverse_shake(mpp->send_next, grid, 0, nlocal->x,
                                        hi->y - nn, hi->y, lo->z, hi->z);
  COMM_ISEND(mpp, nsend_prev, prev, y);
  COMM_ISEND(mpp, nsend_next, next, y);
  COMM_WAITALL(mpp);
  unpack_brick_reverse_shake(mpp->recv_prev, grid, 0, nlocal->x, 0, nn, lo->z,
                             hi->z);
  unpack_brick_reverse_shake(mpp->recv_next, grid, 0, nlocal->x, nlocal->y - nn,
                             nlocal->y, lo->z, hi->z);

  // z phase: x and y are both restricted to their [0, nlocal) ranges.
  COMM_IRECV(mpp, prev, z);
  COMM_IRECV(mpp, next, z);
  nsend_prev = pack_brick_reverse_shake(mpp->send_prev, grid, 0, nlocal->x, 0,
                                        nlocal->y, lo->z, lo->z + nn);
  nsend_next = pack_brick_reverse_shake(mpp->send_next, grid, 0, nlocal->x, 0,
                                        nlocal->y, hi->z - nn, hi->z);
  COMM_ISEND(mpp, nsend_prev, prev, z);
  COMM_ISEND(mpp, nsend_next, next, z);
  COMM_WAITALL(mpp);
  unpack_brick_reverse_shake(mpp->recv_prev, grid, 0, nlocal->x, 0, nlocal->y,
                             0, nn);
  unpack_brick_reverse_shake(mpp->recv_next, grid, 0, nlocal->x, 0, nlocal->y,
                             nlocal->z - nn, nlocal->z);
}
